{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.6078943749140421,
  "eval_steps": 500,
  "global_step": 8840,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "completion_length": 422.65,
      "completions/clipped_ratio": 0.025,
      "completions/max_length": 835.6,
      "completions/max_terminated_length": 738.6,
      "completions/mean_length": 422.65,
      "completions/mean_terminated_length": 395.875,
      "completions/min_length": 131.9,
      "completions/min_terminated_length": 131.9,
      "epoch": 0.0006876633200385092,
      "frac_reward_zero_std": 0.3,
      "grad_norm": 2.2393298149108887,
      "kl": 1.021408660709858,
      "learning_rate": 4.722222222222222e-06,
      "loss": 0.001,
      "num_tokens": 26778.0,
      "reward": 4.25,
      "reward_std": 0.35469383001327515,
      "rewards/check_coherence/mean": 0.5625,
      "rewards/check_coherence/std": 0.3129152894020081,
      "rewards/check_response_quality/mean": 2.0875,
      "rewards/check_response_quality/std": 0.17355985641479493,
      "rewards/match_format_approximately/mean": 0.6,
      "rewards/match_format_approximately/std": 0.15773502588272095,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 10
    },
    {
      "completion_length": 334.6,
      "completions/clipped_ratio": 0.025,
      "completions/max_length": 824.9,
      "completions/max_terminated_length": 713.2,
      "completions/mean_length": 334.6,
      "completions/mean_terminated_length": 299.05833435058594,
      "completions/min_length": 50.2,
      "completions/min_terminated_length": 50.2,
      "epoch": 0.0013753266400770184,
      "frac_reward_zero_std": 0.1,
      "grad_norm": 5.82886266708374,
      "kl": 1.5751606613397597,
      "learning_rate": 2.3750000000000003e-07,
      "loss": 0.0016,
      "num_tokens": 51474.0,
      "reward": 4.4375,
      "reward_std": 0.4622055947780609,
      "rewards/check_coherence/mean": 0.4875,
      "rewards/check_coherence/std": 0.33231321573257444,
      "rewards/check_response_quality/mean": 2.2125,
      "rewards/check_response_quality/std": 0.2633414089679718,
      "rewards/match_format_approximately/mean": 0.7375,
      "rewards/match_format_approximately/std": 0.21933756470680238,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 20
    },
    {
      "completion_length": 299.05,
      "completions/clipped_ratio": 0.025,
      "completions/max_length": 684.5,
      "completions/max_terminated_length": 582.1,
      "completions/mean_length": 299.05,
      "completions/mean_terminated_length": 268.775,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.0020629899601155273,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.6133402585983276,
      "kl": 1.2915641874074937,
      "learning_rate": 4.1428571428571435e-06,
      "loss": 0.0013,
      "num_tokens": 75404.0,
      "reward": 4.5625,
      "reward_std": 0.6107304871082306,
      "rewards/check_coherence/mean": 0.6125,
      "rewards/check_coherence/std": 0.403445702791214,
      "rewards/check_response_quality/mean": 2.2125,
      "rewards/check_response_quality/std": 0.2752987265586853,
      "rewards/match_format_approximately/mean": 0.7375,
      "rewards/match_format_approximately/std": 0.2404700517654419,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 30
    },
    {
      "completion_length": 347.025,
      "completions/clipped_ratio": 0.075,
      "completions/max_length": 862.6,
      "completions/max_terminated_length": 509.7,
      "completions/mean_length": 347.025,
      "completions/mean_terminated_length": 242.34166870117187,
      "completions/min_length": 58.0,
      "completions/min_terminated_length": 58.0,
      "epoch": 0.0027506532801540367,
      "frac_reward_zero_std": 0.1,
      "grad_norm": 2.190549850463867,
      "kl": 0.8691788390278816,
      "learning_rate": 4.936507936507937e-06,
      "loss": 0.0009,
      "num_tokens": 102089.0,
      "reward": 4.775,
      "reward_std": 0.5918105363845825,
      "rewards/check_coherence/mean": 0.85,
      "rewards/check_coherence/std": 0.345650315284729,
      "rewards/check_response_quality/mean": 2.2,
      "rewards/check_response_quality/std": 0.20347774028778076,
      "rewards/match_format_approximately/mean": 0.725,
      "rewards/match_format_approximately/std": 0.16547005176544188,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 40
    },
    {
      "completion_length": 278.275,
      "completions/clipped_ratio": 0.025,
      "completions/max_length": 695.3,
      "completions/max_terminated_length": 556.3,
      "completions/mean_length": 278.275,
      "completions/mean_terminated_length": 240.375,
      "completions/min_length": 19.8,
      "completions/min_terminated_length": 19.8,
      "epoch": 0.0034383166001925457,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.1459013223648071,
      "kl": 0.7370456486940384,
      "learning_rate": 4.793650793650794e-06,
      "loss": 0.0007,
      "num_tokens": 125192.0,
      "reward": 4.7375,
      "reward_std": 0.9061064124107361,
      "rewards/check_coherence/mean": 0.8625,
      "rewards/check_coherence/std": 0.4861203670501709,
      "rewards/check_response_quality/mean": 2.175,
      "rewards/check_response_quality/std": 0.34289742112159727,
      "rewards/match_format_approximately/mean": 0.725,
      "rewards/match_format_approximately/std": 0.2654700517654419,
      "rewards/match_format_exactly/mean": 0.975,
      "rewards/match_format_exactly/std": 0.05,
      "step": 50
    },
    {
      "completion_length": 127.625,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 249.7,
      "completions/max_terminated_length": 249.7,
      "completions/mean_length": 127.625,
      "completions/mean_terminated_length": 127.625,
      "completions/min_length": 11.6,
      "completions/min_terminated_length": 11.6,
      "epoch": 0.004125979920231055,
      "frac_reward_zero_std": 0.1,
      "grad_norm": 1.772578477859497,
      "kl": 1.2247508466243744,
      "learning_rate": 4.634920634920635e-06,
      "loss": 0.0012,
      "num_tokens": 140377.0,
      "reward": 5.075,
      "reward_std": 0.583526349067688,
      "rewards/check_coherence/mean": 0.925,
      "rewards/check_coherence/std": 0.4689477920532227,
      "rewards/check_response_quality/mean": 2.325,
      "rewards/check_response_quality/std": 0.17886751294136047,
      "rewards/match_format_approximately/mean": 0.825,
      "rewards/match_format_approximately/std": 0.17886751294136047,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 60
    },
    {
      "completion_length": 161.4,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 423.5,
      "completions/max_terminated_length": 423.5,
      "completions/mean_length": 161.4,
      "completions/mean_terminated_length": 161.4,
      "completions/min_length": 17.8,
      "completions/min_terminated_length": 17.8,
      "epoch": 0.004813643240269564,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 3.128584861755371,
      "kl": 0.8261611372232437,
      "learning_rate": 4.476190476190477e-06,
      "loss": 0.0008,
      "num_tokens": 156317.0,
      "reward": 5.1625,
      "reward_std": 0.6853798747062683,
      "rewards/check_coherence/mean": 1.025,
      "rewards/check_coherence/std": 0.4077350258827209,
      "rewards/check_response_quality/mean": 2.3125,
      "rewards/check_response_quality/std": 0.21582483053207396,
      "rewards/match_format_approximately/mean": 0.825,
      "rewards/match_format_approximately/std": 0.2,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 70
    },
    {
      "completion_length": 120.125,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 248.3,
      "completions/max_terminated_length": 248.3,
      "completions/mean_length": 120.125,
      "completions/mean_terminated_length": 120.125,
      "completions/min_length": 16.5,
      "completions/min_terminated_length": 16.5,
      "epoch": 0.0055013065603080735,
      "frac_reward_zero_std": 0.4,
      "grad_norm": 2.667182683944702,
      "kl": 0.8017235696315765,
      "learning_rate": 4.317460317460318e-06,
      "loss": 0.0008,
      "num_tokens": 171822.0,
      "reward": 5.4,
      "reward_std": 0.41746232509613035,
      "rewards/check_coherence/mean": 1.2625,
      "rewards/check_coherence/std": 0.2978713572025299,
      "rewards/check_response_quality/mean": 2.3125,
      "rewards/check_response_quality/std": 0.11582483053207397,
      "rewards/match_format_approximately/mean": 0.825,
      "rewards/match_format_approximately/std": 0.1,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 80
    },
    {
      "completion_length": 164.875,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 490.9,
      "completions/max_terminated_length": 490.9,
      "completions/mean_length": 164.875,
      "completions/mean_terminated_length": 164.875,
      "completions/min_length": 13.5,
      "completions/min_terminated_length": 13.5,
      "epoch": 0.006188969880346582,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.8317680358886719,
      "kl": 0.8454075694084168,
      "learning_rate": 4.158730158730159e-06,
      "loss": 0.0008,
      "num_tokens": 189509.0,
      "reward": 5.325,
      "reward_std": 0.7779198408126831,
      "rewards/check_coherence/mean": 1.15,
      "rewards/check_coherence/std": 0.47320507764816283,
      "rewards/check_response_quality/mean": 2.3375,
      "rewards/check_response_quality/std": 0.21933756470680238,
      "rewards/match_format_approximately/mean": 0.8375,
      "rewards/match_format_approximately/std": 0.21933756470680238,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 90
    },
    {
      "completion_length": 128.325,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 322.5,
      "completions/max_terminated_length": 322.5,
      "completions/mean_length": 128.325,
      "completions/mean_terminated_length": 128.325,
      "completions/min_length": 46.3,
      "completions/min_terminated_length": 46.3,
      "epoch": 0.006876633200385091,
      "frac_reward_zero_std": 0.3,
      "grad_norm": 0.003549874061718583,
      "kl": 0.820201675593853,
      "learning_rate": 4.000000000000001e-06,
      "loss": 0.0008,
      "num_tokens": 205998.0,
      "reward": 5.45,
      "reward_std": 0.5343478560447693,
      "rewards/check_coherence/mean": 1.225,
      "rewards/check_coherence/std": 0.3809401035308838,
      "rewards/check_response_quality/mean": 2.3625,
      "rewards/check_response_quality/std": 0.10386751294136047,
      "rewards/match_format_approximately/mean": 0.8625,
      "rewards/match_format_approximately/std": 0.10386751294136047,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 100
    },
    {
      "completion_length": 147.575,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 452.6,
      "completions/max_terminated_length": 452.6,
      "completions/mean_length": 147.575,
      "completions/mean_terminated_length": 147.575,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.007564296520423601,
      "frac_reward_zero_std": 0.1,
      "grad_norm": 2.094895124435425,
      "kl": 1.165158998966217,
      "learning_rate": 3.857142857142858e-06,
      "loss": 0.0012,
      "num_tokens": 224325.0,
      "reward": 5.3,
      "reward_std": 0.8748959302902222,
      "rewards/check_coherence/mean": 1.2125,
      "rewards/check_coherence/std": 0.38273502588272096,
      "rewards/check_response_quality/mean": 2.2875,
      "rewards/check_response_quality/std": 0.28898603916168214,
      "rewards/match_format_approximately/mean": 0.825,
      "rewards/match_format_approximately/std": 0.22320507764816283,
      "rewards/match_format_exactly/mean": 0.975,
      "rewards/match_format_exactly/std": 0.05,
      "step": 110
    },
    {
      "completion_length": 93.475,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 232.9,
      "completions/max_terminated_length": 232.9,
      "completions/mean_length": 93.475,
      "completions/mean_terminated_length": 93.475,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.00825195984046211,
      "frac_reward_zero_std": 0.3,
      "grad_norm": 1.8320742845535278,
      "kl": 0.8784003466367721,
      "learning_rate": 3.6984126984126987e-06,
      "loss": 0.0009,
      "num_tokens": 238872.0,
      "reward": 5.6125,
      "reward_std": 0.5023834943771363,
      "rewards/check_coherence/mean": 1.3875,
      "rewards/check_coherence/std": 0.1978713572025299,
      "rewards/check_response_quality/mean": 2.3625,
      "rewards/check_response_quality/std": 0.18273502588272095,
      "rewards/match_format_approximately/mean": 0.8625,
      "rewards/match_format_approximately/std": 0.18273502588272095,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 120
    },
    {
      "completion_length": 63.425,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 180.7,
      "completions/max_terminated_length": 180.7,
      "completions/mean_length": 63.425,
      "completions/mean_terminated_length": 63.425,
      "completions/min_length": 9.8,
      "completions/min_terminated_length": 9.8,
      "epoch": 0.008939623160500619,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 1.388489007949829,
      "kl": 0.855793622136116,
      "learning_rate": 3.53968253968254e-06,
      "loss": 0.0009,
      "num_tokens": 252433.0,
      "reward": 5.75,
      "reward_std": 0.4457427144050598,
      "rewards/check_coherence/mean": 1.4,
      "rewards/check_coherence/std": 0.2,
      "rewards/check_response_quality/mean": 2.425,
      "rewards/check_response_quality/std": 0.12886751294136048,
      "rewards/match_format_approximately/mean": 0.925,
      "rewards/match_format_approximately/std": 0.12886751294136048,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 130
    },
    {
      "completion_length": 41.35,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 127.7,
      "completions/max_terminated_length": 127.7,
      "completions/mean_length": 41.35,
      "completions/mean_terminated_length": 41.35,
      "completions/min_length": 7.7,
      "completions/min_terminated_length": 7.7,
      "epoch": 0.009627286480539128,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.7645645141601562,
      "kl": 1.5349723994731903,
      "learning_rate": 3.3809523809523814e-06,
      "loss": 0.0015,
      "num_tokens": 264579.0,
      "reward": 5.0875,
      "reward_std": 1.1462337374687195,
      "rewards/check_coherence/mean": 1.15,
      "rewards/check_coherence/std": 0.48867512941360475,
      "rewards/check_response_quality/mean": 2.1875,
      "rewards/check_response_quality/std": 0.3914190471172333,
      "rewards/match_format_approximately/mean": 0.85,
      "rewards/match_format_approximately/std": 0.20773502588272094,
      "rewards/match_format_exactly/mean": 0.9,
      "rewards/match_format_exactly/std": 0.1154700517654419,
      "step": 140
    },
    {
      "completion_length": 46.625,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 135.3,
      "completions/max_terminated_length": 135.3,
      "completions/mean_length": 46.625,
      "completions/mean_terminated_length": 46.625,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.010314949800577638,
      "frac_reward_zero_std": 0.4,
      "grad_norm": 0.36925405263900757,
      "kl": 1.2759959518909454,
      "learning_rate": 3.2222222222222227e-06,
      "loss": 0.0013,
      "num_tokens": 278280.0,
      "reward": 5.7,
      "reward_std": 0.4686140716075897,
      "rewards/check_coherence/mean": 1.3875,
      "rewards/check_coherence/std": 0.225,
      "rewards/check_response_quality/mean": 2.4,
      "rewards/check_response_quality/std": 0.15173887014389037,
      "rewards/match_format_approximately/mean": 0.9125,
      "rewards/match_format_approximately/std": 0.13273502588272096,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 150
    },
    {
      "completion_length": 37.85,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 108.9,
      "completions/max_terminated_length": 108.9,
      "completions/mean_length": 37.85,
      "completions/mean_terminated_length": 37.85,
      "completions/min_length": 10.3,
      "completions/min_terminated_length": 10.3,
      "epoch": 0.011002613120616147,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.003718956606462598,
      "kl": 1.1356925666332245,
      "learning_rate": 3.063492063492064e-06,
      "loss": 0.0011,
      "num_tokens": 289698.0,
      "reward": 5.8625,
      "reward_std": 0.275,
      "rewards/check_coherence/mean": 1.4,
      "rewards/check_coherence/std": 0.2,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.05,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 160
    },
    {
      "completion_length": 24.475,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 58.6,
      "completions/max_terminated_length": 58.6,
      "completions/mean_length": 24.475,
      "completions/mean_terminated_length": 24.475,
      "completions/min_length": 10.1,
      "completions/min_terminated_length": 10.1,
      "epoch": 0.011690276440654656,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.0026451745070517063,
      "kl": 1.1320037961006164,
      "learning_rate": 2.9047619047619053e-06,
      "loss": 0.0011,
      "num_tokens": 302601.0,
      "reward": 5.9,
      "reward_std": 0.14574271440505981,
      "rewards/check_coherence/mean": 1.425,
      "rewards/check_coherence/std": 0.10773502588272095,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 170
    },
    {
      "completion_length": 65.375,
      "completions/clipped_ratio": 0.025,
      "completions/max_length": 216.0,
      "completions/max_terminated_length": 67.0,
      "completions/mean_length": 65.375,
      "completions/mean_terminated_length": 26.866666793823242,
      "completions/min_length": 9.4,
      "completions/min_terminated_length": 9.4,
      "epoch": 0.012377939760693164,
      "frac_reward_zero_std": 0.3,
      "grad_norm": 2.4312984943389893,
      "kl": 1.0950138330459596,
      "learning_rate": 2.7460317460317466e-06,
      "loss": 0.0011,
      "num_tokens": 317912.0,
      "reward": 5.775,
      "reward_std": 0.3957427144050598,
      "rewards/check_coherence/mean": 1.4,
      "rewards/check_coherence/std": 0.2,
      "rewards/check_response_quality/mean": 2.4375,
      "rewards/check_response_quality/std": 0.10386751294136047,
      "rewards/match_format_approximately/mean": 0.9375,
      "rewards/match_format_approximately/std": 0.10386751294136047,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 180
    },
    {
      "completion_length": 16.275,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 25.2,
      "completions/max_terminated_length": 25.2,
      "completions/mean_length": 16.275,
      "completions/mean_terminated_length": 16.275,
      "completions/min_length": 9.8,
      "completions/min_terminated_length": 9.8,
      "epoch": 0.013065603080731673,
      "frac_reward_zero_std": 0.6,
      "grad_norm": 0.004265956114977598,
      "kl": 1.1890327751636505,
      "learning_rate": 2.587301587301588e-06,
      "loss": 0.0012,
      "num_tokens": 332055.0,
      "reward": 5.8875,
      "reward_std": 0.18273502588272095,
      "rewards/check_coherence/mean": 1.3875,
      "rewards/check_coherence/std": 0.18273502588272095,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 190
    },
    {
      "completion_length": 37.9,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 112.8,
      "completions/max_terminated_length": 112.8,
      "completions/mean_length": 37.9,
      "completions/mean_terminated_length": 37.9,
      "completions/min_length": 8.1,
      "completions/min_terminated_length": 8.1,
      "epoch": 0.013753266400770183,
      "frac_reward_zero_std": 0.4,
      "grad_norm": 16.73357582092285,
      "kl": 1.1858276724815369,
      "learning_rate": 2.428571428571429e-06,
      "loss": 0.0012,
      "num_tokens": 344875.0,
      "reward": 5.75,
      "reward_std": 0.4457427144050598,
      "rewards/check_coherence/mean": 1.35,
      "rewards/check_coherence/std": 0.25773502588272096,
      "rewards/check_response_quality/mean": 2.45,
      "rewards/check_response_quality/std": 0.1,
      "rewards/match_format_approximately/mean": 0.95,
      "rewards/match_format_approximately/std": 0.1,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 200
    },
    {
      "completion_length": 57.225,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 142.9,
      "completions/max_terminated_length": 142.9,
      "completions/mean_length": 57.225,
      "completions/mean_terminated_length": 57.225,
      "completions/min_length": 9.4,
      "completions/min_terminated_length": 9.4,
      "epoch": 0.014440929720808692,
      "frac_reward_zero_std": 0.6,
      "grad_norm": 0.006797971669584513,
      "kl": 1.0238984107971192,
      "learning_rate": 2.26984126984127e-06,
      "loss": 0.001,
      "num_tokens": 359308.0,
      "reward": 5.7625,
      "reward_std": 0.2946484684944153,
      "rewards/check_coherence/mean": 1.375,
      "rewards/check_coherence/std": 0.20773502588272094,
      "rewards/check_response_quality/mean": 2.4375,
      "rewards/check_response_quality/std": 0.06582483053207397,
      "rewards/match_format_approximately/mean": 0.95,
      "rewards/match_format_approximately/std": 0.05,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 210
    },
    {
      "completion_length": 60.225,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 198.7,
      "completions/max_terminated_length": 198.7,
      "completions/mean_length": 60.225,
      "completions/mean_terminated_length": 60.225,
      "completions/min_length": 10.7,
      "completions/min_terminated_length": 10.7,
      "epoch": 0.015128593040847202,
      "frac_reward_zero_std": 0.4,
      "grad_norm": 1.2490910291671753,
      "kl": 1.139722502231598,
      "learning_rate": 2.1111111111111114e-06,
      "loss": 0.0011,
      "num_tokens": 372089.0,
      "reward": 5.8,
      "reward_std": 0.4,
      "rewards/check_coherence/mean": 1.375,
      "rewards/check_coherence/std": 0.25,
      "rewards/check_response_quality/mean": 2.4625,
      "rewards/check_response_quality/std": 0.075,
      "rewards/match_format_approximately/mean": 0.9625,
      "rewards/match_format_approximately/std": 0.075,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 220
    },
    {
      "completion_length": 30.075,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 84.5,
      "completions/max_terminated_length": 84.5,
      "completions/mean_length": 30.075,
      "completions/mean_terminated_length": 30.075,
      "completions/min_length": 9.8,
      "completions/min_terminated_length": 9.8,
      "epoch": 0.01581625636088571,
      "frac_reward_zero_std": 0.7,
      "grad_norm": 0.004948398098349571,
      "kl": 1.1260765612125396,
      "learning_rate": 1.9523809523809527e-06,
      "loss": 0.0011,
      "num_tokens": 384276.0,
      "reward": 5.875,
      "reward_std": 0.1728713572025299,
      "rewards/check_coherence/mean": 1.4375,
      "rewards/check_coherence/std": 0.0978713572025299,
      "rewards/check_response_quality/mean": 2.4625,
      "rewards/check_response_quality/std": 0.075,
      "rewards/match_format_approximately/mean": 0.975,
      "rewards/match_format_approximately/std": 0.05,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 230
    },
    {
      "completion_length": 65.25,
      "completions/clipped_ratio": 0.025,
      "completions/max_length": 223.1,
      "completions/max_terminated_length": 65.9,
      "completions/mean_length": 65.25,
      "completions/mean_terminated_length": 25.908333396911623,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.01650391968092422,
      "frac_reward_zero_std": 0.6,
      "grad_norm": 0.0049158609472215176,
      "kl": 1.044507622718811,
      "learning_rate": 1.7936507936507938e-06,
      "loss": 0.001,
      "num_tokens": 399686.0,
      "reward": 5.8375,
      "reward_std": 0.325,
      "rewards/check_coherence/mean": 1.45,
      "rewards/check_coherence/std": 0.1,
      "rewards/check_response_quality/mean": 2.4375,
      "rewards/check_response_quality/std": 0.125,
      "rewards/match_format_approximately/mean": 0.95,
      "rewards/match_format_approximately/std": 0.1,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 240
    },
    {
      "completion_length": 34.25,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 97.6,
      "completions/max_terminated_length": 97.6,
      "completions/mean_length": 34.25,
      "completions/mean_terminated_length": 34.25,
      "completions/min_length": 9.6,
      "completions/min_terminated_length": 9.6,
      "epoch": 0.01719158300096273,
      "frac_reward_zero_std": 0.6,
      "grad_norm": 0.0035998751409351826,
      "kl": 1.131579464673996,
      "learning_rate": 1.6349206349206351e-06,
      "loss": 0.0011,
      "num_tokens": 412936.0,
      "reward": 5.875,
      "reward_std": 0.25,
      "rewards/check_coherence/mean": 1.45,
      "rewards/check_coherence/std": 0.1,
      "rewards/check_response_quality/mean": 2.4625,
      "rewards/check_response_quality/std": 0.075,
      "rewards/match_format_approximately/mean": 0.9625,
      "rewards/match_format_approximately/std": 0.075,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 250
    },
    {
      "completion_length": 44.525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 138.9,
      "completions/max_terminated_length": 138.9,
      "completions/mean_length": 44.525,
      "completions/mean_terminated_length": 44.525,
      "completions/min_length": 9.5,
      "completions/min_terminated_length": 9.5,
      "epoch": 0.017879246321001237,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.003912392072379589,
      "kl": 1.0597735822200776,
      "learning_rate": 1.4761904761904762e-06,
      "loss": 0.0011,
      "num_tokens": 427709.0,
      "reward": 5.9375,
      "reward_std": 0.125,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.05,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 260
    },
    {
      "completion_length": 13.175,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 22.9,
      "completions/max_terminated_length": 22.9,
      "completions/mean_length": 13.175,
      "completions/mean_terminated_length": 13.175,
      "completions/min_length": 7.3,
      "completions/min_terminated_length": 7.3,
      "epoch": 0.01856690964103975,
      "frac_reward_zero_std": 0.6,
      "grad_norm": 4.749101638793945,
      "kl": 1.352827501296997,
      "learning_rate": 1.3174603174603175e-06,
      "loss": 0.0014,
      "num_tokens": 437888.0,
      "reward": 5.7625,
      "reward_std": 0.41304759979248046,
      "rewards/check_coherence/mean": 1.4125,
      "rewards/check_coherence/std": 0.175,
      "rewards/check_response_quality/mean": 2.4125,
      "rewards/check_response_quality/std": 0.14464847445487977,
      "rewards/match_format_approximately/mean": 0.9625,
      "rewards/match_format_approximately/std": 0.053867512941360475,
      "rewards/match_format_exactly/mean": 0.975,
      "rewards/match_format_exactly/std": 0.05,
      "step": 270
    },
    {
      "completion_length": 29.05,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 75.4,
      "completions/max_terminated_length": 75.4,
      "completions/mean_length": 29.05,
      "completions/mean_terminated_length": 29.05,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.019254572961078256,
      "frac_reward_zero_std": 0.7,
      "grad_norm": 0.0032444808166474104,
      "kl": 1.0743911743164063,
      "learning_rate": 1.1587301587301589e-06,
      "loss": 0.0011,
      "num_tokens": 450566.0,
      "reward": 5.8875,
      "reward_std": 0.225,
      "rewards/check_coherence/mean": 1.4375,
      "rewards/check_coherence/std": 0.125,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.05,
      "rewards/match_format_approximately/mean": 0.975,
      "rewards/match_format_approximately/std": 0.05,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 280
    },
    {
      "completion_length": 35.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 100.7,
      "completions/max_terminated_length": 100.7,
      "completions/mean_length": 35.0,
      "completions/mean_terminated_length": 35.0,
      "completions/min_length": 9.7,
      "completions/min_terminated_length": 9.7,
      "epoch": 0.019942236281116764,
      "frac_reward_zero_std": 0.7,
      "grad_norm": 4.597572326660156,
      "kl": 9.488562166690826,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 0.0095,
      "num_tokens": 462366.0,
      "reward": 5.875,
      "reward_std": 0.20773502588272094,
      "rewards/check_coherence/mean": 1.4,
      "rewards/check_coherence/std": 0.15773502588272095,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 290
    },
    {
      "completion_length": 16.875,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 31.8,
      "completions/max_terminated_length": 31.8,
      "completions/mean_length": 16.875,
      "completions/mean_terminated_length": 16.875,
      "completions/min_length": 8.6,
      "completions/min_terminated_length": 8.6,
      "epoch": 0.020629899601155275,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.0025285985320806503,
      "kl": 1.0927811563014984,
      "learning_rate": 8.412698412698414e-07,
      "loss": 0.0011,
      "num_tokens": 473501.0,
      "reward": 5.9375,
      "reward_std": 0.125,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4625,
      "rewards/check_response_quality/std": 0.075,
      "rewards/match_format_approximately/mean": 0.975,
      "rewards/match_format_approximately/std": 0.05,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 300
    },
    {
      "completion_length": 34.825,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 104.8,
      "completions/max_terminated_length": 104.8,
      "completions/mean_length": 34.825,
      "completions/mean_terminated_length": 34.825,
      "completions/min_length": 9.6,
      "completions/min_terminated_length": 9.6,
      "epoch": 0.021317562921193783,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.004729569889605045,
      "kl": 1.0428565800189973,
      "learning_rate": 6.825396825396826e-07,
      "loss": 0.001,
      "num_tokens": 487282.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 310
    },
    {
      "completion_length": 28.475,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 81.7,
      "completions/max_terminated_length": 81.7,
      "completions/mean_length": 28.475,
      "completions/mean_terminated_length": 28.475,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.022005226241232294,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 6.385797500610352,
      "kl": 1.2766858220100403,
      "learning_rate": 5.238095238095239e-07,
      "loss": 0.0013,
      "num_tokens": 500121.0,
      "reward": 5.925,
      "reward_std": 0.15,
      "rewards/check_coherence/mean": 1.45,
      "rewards/check_coherence/std": 0.1,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 320
    },
    {
      "completion_length": 18.75,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 33.8,
      "completions/max_terminated_length": 33.8,
      "completions/mean_length": 18.75,
      "completions/mean_terminated_length": 18.75,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.0226928895612708,
      "frac_reward_zero_std": 0.7,
      "grad_norm": 0.010602269321680069,
      "kl": 1.1651100397109986,
      "learning_rate": 3.6507936507936514e-07,
      "loss": 0.0012,
      "num_tokens": 514331.0,
      "reward": 5.8875,
      "reward_std": 0.18273502588272095,
      "rewards/check_coherence/mean": 1.4625,
      "rewards/check_coherence/std": 0.075,
      "rewards/check_response_quality/mean": 2.4625,
      "rewards/check_response_quality/std": 0.053867512941360475,
      "rewards/match_format_approximately/mean": 0.9625,
      "rewards/match_format_approximately/std": 0.053867512941360475,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 330
    },
    {
      "completion_length": 13.8,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 20.7,
      "completions/max_terminated_length": 20.7,
      "completions/mean_length": 13.8,
      "completions/mean_terminated_length": 13.8,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.023380552881309313,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.007794269360601902,
      "kl": 1.1532993257045745,
      "learning_rate": 2.0634920634920635e-07,
      "loss": 0.0012,
      "num_tokens": 528547.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 340
    },
    {
      "completion_length": 13.9,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 21.0,
      "completions/max_terminated_length": 21.0,
      "completions/mean_length": 13.9,
      "completions/mean_terminated_length": 13.9,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.02406821620134782,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 3.0694658756256104,
      "kl": 1.2572884202003478,
      "learning_rate": 4.7619047619047627e-08,
      "loss": 0.0013,
      "num_tokens": 540139.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.45,
      "rewards/check_coherence/std": 0.1,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 350
    },
    {
      "completion_length": 15.875,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 28.5,
      "completions/max_terminated_length": 28.5,
      "completions/mean_length": 15.875,
      "completions/mean_terminated_length": 15.875,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.024755879521386328,
      "frac_reward_zero_std": 0.7,
      "grad_norm": 0.00623699277639389,
      "kl": 1.0916778862476348,
      "learning_rate": 1.588888888888889e-06,
      "loss": 0.0011,
      "num_tokens": 553702.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 360
    },
    {
      "completion_length": 22.4,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 54.6,
      "completions/max_terminated_length": 54.6,
      "completions/mean_length": 22.4,
      "completions/mean_terminated_length": 22.4,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.02544354284142484,
      "frac_reward_zero_std": 0.7,
      "grad_norm": 0.007368564605712891,
      "kl": 1.159883439540863,
      "learning_rate": 1.477777777777778e-06,
      "loss": 0.0012,
      "num_tokens": 566342.0,
      "reward": 5.9,
      "reward_std": 0.2,
      "rewards/check_coherence/mean": 1.425,
      "rewards/check_coherence/std": 0.15,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 370
    },
    {
      "completion_length": 42.2,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 131.7,
      "completions/max_terminated_length": 131.7,
      "completions/mean_length": 42.2,
      "completions/mean_terminated_length": 42.2,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.026131206161463347,
      "frac_reward_zero_std": 0.6,
      "grad_norm": 1.8988958597183228,
      "kl": 1.0139860570430757,
      "learning_rate": 1.3666666666666668e-06,
      "loss": 0.001,
      "num_tokens": 581202.0,
      "reward": 5.85,
      "reward_std": 0.3,
      "rewards/check_coherence/mean": 1.45,
      "rewards/check_coherence/std": 0.1,
      "rewards/check_response_quality/mean": 2.4375,
      "rewards/check_response_quality/std": 0.125,
      "rewards/match_format_approximately/mean": 0.9625,
      "rewards/match_format_approximately/std": 0.075,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 380
    },
    {
      "completion_length": 18.65,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 39.9,
      "completions/max_terminated_length": 39.9,
      "completions/mean_length": 18.65,
      "completions/mean_terminated_length": 18.65,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.026818869481501858,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.006959094665944576,
      "kl": 1.2067798852920533,
      "learning_rate": 1.2555555555555557e-06,
      "loss": 0.0012,
      "num_tokens": 593448.0,
      "reward": 5.925,
      "reward_std": 0.15,
      "rewards/check_coherence/mean": 1.45,
      "rewards/check_coherence/std": 0.1,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 390
    },
    {
      "completion_length": 22.5,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 52.6,
      "completions/max_terminated_length": 52.6,
      "completions/mean_length": 22.5,
      "completions/mean_terminated_length": 22.5,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.027506532801540366,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.0012704557739198208,
      "kl": 0.9775669932365417,
      "learning_rate": 1.1444444444444446e-06,
      "loss": 0.001,
      "num_tokens": 607732.0,
      "reward": 5.925,
      "reward_std": 0.15,
      "rewards/check_coherence/mean": 1.45,
      "rewards/check_coherence/std": 0.1,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 400
    },
    {
      "completion_length": 38.85,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 123.2,
      "completions/max_terminated_length": 123.2,
      "completions/mean_length": 38.85,
      "completions/mean_terminated_length": 38.85,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.028194196121578877,
      "frac_reward_zero_std": 0.7,
      "grad_norm": 0.004135606344789267,
      "kl": 1.141333144903183,
      "learning_rate": 1.0333333333333333e-06,
      "loss": 0.0011,
      "num_tokens": 620094.0,
      "reward": 5.9125,
      "reward_std": 0.175,
      "rewards/check_coherence/mean": 1.4625,
      "rewards/check_coherence/std": 0.075,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.05,
      "rewards/match_format_approximately/mean": 0.975,
      "rewards/match_format_approximately/std": 0.05,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 410
    },
    {
      "completion_length": 33.075,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 96.7,
      "completions/max_terminated_length": 96.7,
      "completions/mean_length": 33.075,
      "completions/mean_terminated_length": 33.075,
      "completions/min_length": 9.2,
      "completions/min_terminated_length": 9.2,
      "epoch": 0.028881859441617384,
      "frac_reward_zero_std": 0.7,
      "grad_norm": 0.01156042329967022,
      "kl": 1.4050394296646118,
      "learning_rate": 9.222222222222222e-07,
      "loss": 0.0014,
      "num_tokens": 633025.0,
      "reward": 5.875,
      "reward_std": 0.25,
      "rewards/check_coherence/mean": 1.45,
      "rewards/check_coherence/std": 0.1,
      "rewards/check_response_quality/mean": 2.4625,
      "rewards/check_response_quality/std": 0.075,
      "rewards/match_format_approximately/mean": 0.9625,
      "rewards/match_format_approximately/std": 0.075,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 420
    },
    {
      "completion_length": 30.55,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 80.9,
      "completions/max_terminated_length": 80.9,
      "completions/mean_length": 30.55,
      "completions/mean_terminated_length": 30.55,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.029569522761655892,
      "frac_reward_zero_std": 0.7,
      "grad_norm": 2.049229621887207,
      "kl": 0.9671716094017029,
      "learning_rate": 8.111111111111112e-07,
      "loss": 0.001,
      "num_tokens": 645383.0,
      "reward": 5.8625,
      "reward_std": 0.175,
      "rewards/check_coherence/mean": 1.45,
      "rewards/check_coherence/std": 0.05773502588272095,
      "rewards/check_response_quality/mean": 2.45,
      "rewards/check_response_quality/std": 0.1,
      "rewards/match_format_approximately/mean": 0.9625,
      "rewards/match_format_approximately/std": 0.075,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 430
    },
    {
      "completion_length": 32.375,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 96.9,
      "completions/max_terminated_length": 96.9,
      "completions/mean_length": 32.375,
      "completions/mean_terminated_length": 32.375,
      "completions/min_length": 8.2,
      "completions/min_terminated_length": 8.2,
      "epoch": 0.030257186081694403,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.00446512084454298,
      "kl": 1.2508110523223877,
      "learning_rate": 7.000000000000001e-07,
      "loss": 0.0013,
      "num_tokens": 659314.0,
      "reward": 5.9375,
      "reward_std": 0.125,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.05,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 440
    },
    {
      "completion_length": 27.725,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 77.2,
      "completions/max_terminated_length": 77.2,
      "completions/mean_length": 27.725,
      "completions/mean_terminated_length": 27.725,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.03094484940173291,
      "frac_reward_zero_std": 0.6,
      "grad_norm": 0.0031550778076052666,
      "kl": 1.1202711045742035,
      "learning_rate": 5.888888888888889e-07,
      "loss": 0.0011,
      "num_tokens": 671575.0,
      "reward": 5.8625,
      "reward_std": 0.275,
      "rewards/check_coherence/mean": 1.425,
      "rewards/check_coherence/std": 0.15,
      "rewards/check_response_quality/mean": 2.4625,
      "rewards/check_response_quality/std": 0.075,
      "rewards/match_format_approximately/mean": 0.975,
      "rewards/match_format_approximately/std": 0.05,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 450
    },
    {
      "completion_length": 13.225,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 21.6,
      "completions/max_terminated_length": 21.6,
      "completions/mean_length": 13.225,
      "completions/mean_terminated_length": 13.225,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.03163251272177142,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0025077220052480698,
      "kl": 1.1845026671886445,
      "learning_rate": 4.777777777777778e-07,
      "loss": 0.0012,
      "num_tokens": 680956.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 460
    },
    {
      "completion_length": 14.7,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 22.4,
      "completions/max_terminated_length": 22.4,
      "completions/mean_length": 14.7,
      "completions/mean_terminated_length": 14.7,
      "completions/min_length": 9.9,
      "completions/min_terminated_length": 9.9,
      "epoch": 0.03232017604180993,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0026869364082813263,
      "kl": 1.0701819598674773,
      "learning_rate": 3.666666666666667e-07,
      "loss": 0.0011,
      "num_tokens": 694668.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 470
    },
    {
      "completion_length": 26.9,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 74.9,
      "completions/max_terminated_length": 74.9,
      "completions/mean_length": 26.9,
      "completions/mean_terminated_length": 26.9,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.03300783936184844,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.0027040443383157253,
      "kl": 1.3269443392753602,
      "learning_rate": 2.555555555555556e-07,
      "loss": 0.0013,
      "num_tokens": 707508.0,
      "reward": 5.925,
      "reward_std": 0.15,
      "rewards/check_coherence/mean": 1.45,
      "rewards/check_coherence/std": 0.1,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 480
    },
    {
      "completion_length": 27.025,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 60.3,
      "completions/max_terminated_length": 60.3,
      "completions/mean_length": 27.025,
      "completions/mean_terminated_length": 27.025,
      "completions/min_length": 7.9,
      "completions/min_terminated_length": 7.9,
      "epoch": 0.033695502681886945,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.0057538580149412155,
      "kl": 1.142683470249176,
      "learning_rate": 1.4444444444444445e-07,
      "loss": 0.0011,
      "num_tokens": 717845.0,
      "reward": 5.875,
      "reward_std": 0.13164966106414794,
      "rewards/check_coherence/mean": 1.45,
      "rewards/check_coherence/std": 0.05773502588272095,
      "rewards/check_response_quality/mean": 2.4625,
      "rewards/check_response_quality/std": 0.053867512941360475,
      "rewards/match_format_approximately/mean": 0.9625,
      "rewards/match_format_approximately/std": 0.053867512941360475,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 490
    },
    {
      "completion_length": 17.275,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 36.0,
      "completions/max_terminated_length": 36.0,
      "completions/mean_length": 17.275,
      "completions/mean_terminated_length": 17.275,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.03438316600192546,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.005735357291996479,
      "kl": 1.0870766162872314,
      "learning_rate": 3.333333333333334e-08,
      "loss": 0.0011,
      "num_tokens": 729556.0,
      "reward": 5.9125,
      "reward_std": 0.175,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4625,
      "rewards/check_response_quality/std": 0.075,
      "rewards/match_format_approximately/mean": 0.975,
      "rewards/match_format_approximately/std": 0.05,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 500
    },
    {
      "completion_length": 65.975,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 135.7,
      "completions/max_terminated_length": 135.7,
      "completions/mean_length": 65.975,
      "completions/mean_terminated_length": 65.975,
      "completions/min_length": 9.5,
      "completions/min_terminated_length": 9.5,
      "epoch": 0.03507082932196397,
      "frac_reward_zero_std": 0.7,
      "grad_norm": 3.473423719406128,
      "kl": 1.3229190528392791,
      "learning_rate": 4.330917874396136e-06,
      "loss": 0.0013,
      "num_tokens": 740839.0,
      "reward": 5.8375,
      "reward_std": 0.153445702791214,
      "rewards/check_coherence/mean": 1.4125,
      "rewards/check_coherence/std": 0.13273502588272096,
      "rewards/check_response_quality/mean": 2.45,
      "rewards/check_response_quality/std": 0.05773502588272095,
      "rewards/match_format_approximately/mean": 0.975,
      "rewards/match_format_approximately/std": 0.028867512941360474,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 510
    },
    {
      "completion_length": 51.975,
      "completions/clipped_ratio": 0.025,
      "completions/max_length": 175.6,
      "completions/max_terminated_length": 19.1,
      "completions/mean_length": 51.975,
      "completions/mean_terminated_length": 12.7,
      "completions/min_length": 7.6,
      "completions/min_terminated_length": 7.6,
      "epoch": 0.035758492642002475,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.005841626785695553,
      "kl": 3.1703031241893767,
      "learning_rate": 4.3067632850241545e-06,
      "loss": 0.0032,
      "num_tokens": 754322.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 520
    },
    {
      "completion_length": 26.65,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 69.3,
      "completions/max_terminated_length": 69.3,
      "completions/mean_length": 26.65,
      "completions/mean_terminated_length": 26.65,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.03644615596204098,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.0034844442270696163,
      "kl": 1.1955626547336577,
      "learning_rate": 4.282608695652175e-06,
      "loss": 0.0012,
      "num_tokens": 768516.0,
      "reward": 5.9125,
      "reward_std": 0.12074271440505982,
      "rewards/check_coherence/mean": 1.4375,
      "rewards/check_coherence/std": 0.08273502588272094,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 530
    },
    {
      "completion_length": 49.6,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 158.8,
      "completions/max_terminated_length": 158.8,
      "completions/mean_length": 49.6,
      "completions/mean_terminated_length": 49.6,
      "completions/min_length": 9.5,
      "completions/min_terminated_length": 9.5,
      "epoch": 0.0371338192820795,
      "frac_reward_zero_std": 0.7,
      "grad_norm": 2.5357494354248047,
      "kl": 1.0726108729839325,
      "learning_rate": 4.2584541062801936e-06,
      "loss": 0.0011,
      "num_tokens": 783124.0,
      "reward": 5.8125,
      "reward_std": 0.24571067690849305,
      "rewards/check_coherence/mean": 1.425,
      "rewards/check_coherence/std": 0.09082483053207398,
      "rewards/check_response_quality/mean": 2.4375,
      "rewards/check_response_quality/std": 0.10386751294136047,
      "rewards/match_format_approximately/mean": 0.95,
      "rewards/match_format_approximately/std": 0.07886751294136048,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 540
    },
    {
      "completion_length": 10.4,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.5,
      "completions/max_terminated_length": 13.5,
      "completions/mean_length": 10.4,
      "completions/mean_terminated_length": 10.4,
      "completions/min_length": 7.9,
      "completions/min_terminated_length": 7.9,
      "epoch": 0.037821482602118005,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007602162193506956,
      "kl": 1.2256620168685912,
      "learning_rate": 4.234299516908213e-06,
      "loss": 0.0012,
      "num_tokens": 793232.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 550
    },
    {
      "completion_length": 11.275,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.3,
      "completions/max_terminated_length": 15.3,
      "completions/mean_length": 11.275,
      "completions/mean_terminated_length": 11.275,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.03850914592215651,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.005174298770725727,
      "kl": 1.4170250117778778,
      "learning_rate": 4.210144927536232e-06,
      "loss": 0.0014,
      "num_tokens": 804267.0,
      "reward": 5.9875,
      "reward_std": 0.025,
      "rewards/check_coherence/mean": 1.4875,
      "rewards/check_coherence/std": 0.025,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 560
    },
    {
      "completion_length": 31.3,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 94.0,
      "completions/max_terminated_length": 94.0,
      "completions/mean_length": 31.3,
      "completions/mean_terminated_length": 31.3,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.03919680924219502,
      "frac_reward_zero_std": 0.7,
      "grad_norm": 4.267595291137695,
      "kl": 1.1769875526428222,
      "learning_rate": 4.185990338164251e-06,
      "loss": 0.0012,
      "num_tokens": 817063.0,
      "reward": 5.875,
      "reward_std": 0.25,
      "rewards/check_coherence/mean": 1.4375,
      "rewards/check_coherence/std": 0.125,
      "rewards/check_response_quality/mean": 2.4625,
      "rewards/check_response_quality/std": 0.075,
      "rewards/match_format_approximately/mean": 0.975,
      "rewards/match_format_approximately/std": 0.05,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 570
    },
    {
      "completion_length": 32.925,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 99.6,
      "completions/max_terminated_length": 99.6,
      "completions/mean_length": 32.925,
      "completions/mean_terminated_length": 32.925,
      "completions/min_length": 8.6,
      "completions/min_terminated_length": 8.6,
      "epoch": 0.03988447256223353,
      "frac_reward_zero_std": 0.6,
      "grad_norm": 4.495794773101807,
      "kl": 1.3460102200508117,
      "learning_rate": 4.161835748792271e-06,
      "loss": 0.0013,
      "num_tokens": 830080.0,
      "reward": 5.9125,
      "reward_std": 0.175,
      "rewards/check_coherence/mean": 1.4375,
      "rewards/check_coherence/std": 0.125,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 580
    },
    {
      "completion_length": 64.65,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 202.7,
      "completions/max_terminated_length": 202.7,
      "completions/mean_length": 64.65,
      "completions/mean_terminated_length": 64.65,
      "completions/min_length": 7.1,
      "completions/min_terminated_length": 7.1,
      "epoch": 0.04057213588227204,
      "frac_reward_zero_std": 0.6,
      "grad_norm": 2.4681625366210938,
      "kl": 1.3417995631694795,
      "learning_rate": 4.13768115942029e-06,
      "loss": 0.0013,
      "num_tokens": 843546.0,
      "reward": 5.725,
      "reward_std": 0.4217355728149414,
      "rewards/check_coherence/mean": 1.4375,
      "rewards/check_coherence/std": 0.125,
      "rewards/check_response_quality/mean": 2.3875,
      "rewards/check_response_quality/std": 0.16160253882408143,
      "rewards/match_format_approximately/mean": 0.925,
      "rewards/match_format_approximately/std": 0.1,
      "rewards/match_format_exactly/mean": 0.975,
      "rewards/match_format_exactly/std": 0.05,
      "step": 590
    },
    {
      "completion_length": 10.9,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.8,
      "completions/max_terminated_length": 14.8,
      "completions/mean_length": 10.9,
      "completions/mean_terminated_length": 10.9,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.04125979920231055,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.002406905870884657,
      "kl": 1.624447101354599,
      "learning_rate": 4.11352657004831e-06,
      "loss": 0.0016,
      "num_tokens": 854534.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 600
    },
    {
      "completion_length": 10.725,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.4,
      "completions/max_terminated_length": 13.4,
      "completions/mean_length": 10.725,
      "completions/mean_terminated_length": 10.725,
      "completions/min_length": 7.9,
      "completions/min_terminated_length": 7.9,
      "epoch": 0.04194746252234906,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003911010455340147,
      "kl": 1.4192960679531097,
      "learning_rate": 4.0893719806763285e-06,
      "loss": 0.0014,
      "num_tokens": 865203.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 610
    },
    {
      "completion_length": 12.425,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 19.9,
      "completions/max_terminated_length": 19.9,
      "completions/mean_length": 12.425,
      "completions/mean_terminated_length": 12.425,
      "completions/min_length": 8.1,
      "completions/min_terminated_length": 8.1,
      "epoch": 0.042635125842387565,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0038089642766863108,
      "kl": 1.4190670549869537,
      "learning_rate": 4.065217391304348e-06,
      "loss": 0.0014,
      "num_tokens": 876636.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 620
    },
    {
      "completion_length": 17.1,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 31.9,
      "completions/max_terminated_length": 31.9,
      "completions/mean_length": 17.1,
      "completions/mean_terminated_length": 17.1,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.04332278916242607,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.004204964730888605,
      "kl": 1.2702333629131317,
      "learning_rate": 4.0410628019323675e-06,
      "loss": 0.0013,
      "num_tokens": 887312.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 630
    },
    {
      "completion_length": 11.675,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.8,
      "completions/max_terminated_length": 16.8,
      "completions/mean_length": 11.675,
      "completions/mean_terminated_length": 11.675,
      "completions/min_length": 7.8,
      "completions/min_terminated_length": 7.8,
      "epoch": 0.04401045248246459,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.013301965780556202,
      "kl": 1.4880116164684296,
      "learning_rate": 4.016908212560387e-06,
      "loss": 0.0015,
      "num_tokens": 900155.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 640
    },
    {
      "completion_length": 11.975,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.3,
      "completions/max_terminated_length": 16.3,
      "completions/mean_length": 11.975,
      "completions/mean_terminated_length": 11.975,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.044698115802503095,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.003986676223576069,
      "kl": 1.375555509328842,
      "learning_rate": 3.9927536231884065e-06,
      "loss": 0.0014,
      "num_tokens": 913378.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 650
    },
    {
      "completion_length": 12.375,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 20.2,
      "completions/max_terminated_length": 20.2,
      "completions/mean_length": 12.375,
      "completions/mean_terminated_length": 12.375,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.0453857791225416,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0072197564877569675,
      "kl": 1.4778401851654053,
      "learning_rate": 3.968599033816425e-06,
      "loss": 0.0015,
      "num_tokens": 925801.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 660
    },
    {
      "completion_length": 10.1,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 10.1,
      "completions/mean_terminated_length": 10.1,
      "completions/min_length": 7.9,
      "completions/min_terminated_length": 7.9,
      "epoch": 0.04607344244258011,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00877147726714611,
      "kl": 1.5386210203170776,
      "learning_rate": 3.944444444444445e-06,
      "loss": 0.0015,
      "num_tokens": 938605.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 670
    },
    {
      "completion_length": 26.9,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 79.7,
      "completions/max_terminated_length": 79.7,
      "completions/mean_length": 26.9,
      "completions/mean_terminated_length": 26.9,
      "completions/min_length": 8.1,
      "completions/min_terminated_length": 8.1,
      "epoch": 0.046761105762618625,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.0056604305282235146,
      "kl": 1.3617631494998932,
      "learning_rate": 3.920289855072464e-06,
      "loss": 0.0014,
      "num_tokens": 951505.0,
      "reward": 5.925,
      "reward_std": 0.15,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.05,
      "rewards/match_format_approximately/mean": 0.975,
      "rewards/match_format_approximately/std": 0.05,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 680
    },
    {
      "completion_length": 11.65,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.4,
      "completions/max_terminated_length": 16.4,
      "completions/mean_length": 11.65,
      "completions/mean_terminated_length": 11.65,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.04744876908265713,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.011466645635664463,
      "kl": 1.4758114337921142,
      "learning_rate": 3.896135265700484e-06,
      "loss": 0.0015,
      "num_tokens": 965291.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 690
    },
    {
      "completion_length": 11.6,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 18.9,
      "completions/max_terminated_length": 18.9,
      "completions/mean_length": 11.6,
      "completions/mean_terminated_length": 11.6,
      "completions/min_length": 7.7,
      "completions/min_terminated_length": 7.7,
      "epoch": 0.04813643240269564,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.004098635632544756,
      "kl": 1.9950886607170104,
      "learning_rate": 3.871980676328503e-06,
      "loss": 0.002,
      "num_tokens": 980115.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 700
    },
    {
      "completion_length": 11.625,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.1,
      "completions/max_terminated_length": 16.1,
      "completions/mean_length": 11.625,
      "completions/mean_terminated_length": 11.625,
      "completions/min_length": 8.6,
      "completions/min_terminated_length": 8.6,
      "epoch": 0.04882409572273415,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0038452409207820892,
      "kl": 1.3746231377124787,
      "learning_rate": 3.847826086956522e-06,
      "loss": 0.0014,
      "num_tokens": 992504.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 710
    },
    {
      "completion_length": 10.875,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.6,
      "completions/max_terminated_length": 13.6,
      "completions/mean_length": 10.875,
      "completions/mean_terminated_length": 10.875,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.049511759042772656,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027600303292274475,
      "kl": 1.4807618498802184,
      "learning_rate": 3.8236714975845414e-06,
      "loss": 0.0015,
      "num_tokens": 1005663.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 720
    },
    {
      "completion_length": 12.375,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 20.2,
      "completions/max_terminated_length": 20.2,
      "completions/mean_length": 12.375,
      "completions/mean_terminated_length": 12.375,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.05019942236281117,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.006777458358556032,
      "kl": 1.5830685496330261,
      "learning_rate": 3.7995169082125605e-06,
      "loss": 0.0016,
      "num_tokens": 1018350.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 730
    },
    {
      "completion_length": 12.125,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.4,
      "completions/max_terminated_length": 17.4,
      "completions/mean_length": 12.125,
      "completions/mean_terminated_length": 12.125,
      "completions/min_length": 7.9,
      "completions/min_terminated_length": 7.9,
      "epoch": 0.05088708568284968,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.003976897802203894,
      "kl": 1.3298546731472016,
      "learning_rate": 3.7753623188405805e-06,
      "loss": 0.0013,
      "num_tokens": 1030623.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 740
    },
    {
      "completion_length": 10.35,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.9,
      "completions/max_terminated_length": 12.9,
      "completions/mean_length": 10.35,
      "completions/mean_terminated_length": 10.35,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.051574749002888186,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004374953452497721,
      "kl": 1.4491690695285797,
      "learning_rate": 3.7512077294685995e-06,
      "loss": 0.0014,
      "num_tokens": 1040569.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 750
    },
    {
      "completion_length": 13.875,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 23.8,
      "completions/max_terminated_length": 23.8,
      "completions/mean_length": 13.875,
      "completions/mean_terminated_length": 13.875,
      "completions/min_length": 8.6,
      "completions/min_terminated_length": 8.6,
      "epoch": 0.052262412322926693,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.005473052617162466,
      "kl": 1.305114781856537,
      "learning_rate": 3.7270531400966186e-06,
      "loss": 0.0013,
      "num_tokens": 1051800.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 760
    },
    {
      "completion_length": 10.35,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.2,
      "completions/max_terminated_length": 14.2,
      "completions/mean_length": 10.35,
      "completions/mean_terminated_length": 10.35,
      "completions/min_length": 7.8,
      "completions/min_terminated_length": 7.8,
      "epoch": 0.0529500756429652,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00831334013491869,
      "kl": 1.5019812345504762,
      "learning_rate": 3.7028985507246377e-06,
      "loss": 0.0015,
      "num_tokens": 1063438.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 770
    },
    {
      "completion_length": 19.4,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 49.6,
      "completions/max_terminated_length": 49.6,
      "completions/mean_length": 19.4,
      "completions/mean_terminated_length": 19.4,
      "completions/min_length": 7.9,
      "completions/min_terminated_length": 7.9,
      "epoch": 0.053637738963003716,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.005551347974687815,
      "kl": 1.6162667870521545,
      "learning_rate": 3.6787439613526572e-06,
      "loss": 0.0016,
      "num_tokens": 1075242.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 780
    },
    {
      "completion_length": 12.1,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 18.5,
      "completions/max_terminated_length": 18.5,
      "completions/mean_length": 12.1,
      "completions/mean_terminated_length": 12.1,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.05432540228304222,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006306353025138378,
      "kl": 1.3515527844429016,
      "learning_rate": 3.6545893719806768e-06,
      "loss": 0.0014,
      "num_tokens": 1089426.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 790
    },
    {
      "completion_length": 11.2,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.8,
      "completions/max_terminated_length": 14.8,
      "completions/mean_length": 11.2,
      "completions/mean_terminated_length": 11.2,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.05501306560308073,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.006242894567549229,
      "kl": 1.4019363343715667,
      "learning_rate": 3.6304347826086963e-06,
      "loss": 0.0014,
      "num_tokens": 1099998.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 800
    },
    {
      "completion_length": 12.55,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 18.5,
      "completions/max_terminated_length": 18.5,
      "completions/mean_length": 12.55,
      "completions/mean_terminated_length": 12.55,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.05570072892311924,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006163584999740124,
      "kl": 1.3902488827705384,
      "learning_rate": 3.6062801932367154e-06,
      "loss": 0.0014,
      "num_tokens": 1111480.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 810
    },
    {
      "completion_length": 11.25,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.9,
      "completions/max_terminated_length": 15.9,
      "completions/mean_length": 11.25,
      "completions/mean_terminated_length": 11.25,
      "completions/min_length": 7.6,
      "completions/min_terminated_length": 7.6,
      "epoch": 0.05638839224315775,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006469434592872858,
      "kl": 1.574440735578537,
      "learning_rate": 3.5821256038647344e-06,
      "loss": 0.0016,
      "num_tokens": 1123270.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 820
    },
    {
      "completion_length": 11.225,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.6,
      "completions/max_terminated_length": 16.6,
      "completions/mean_length": 11.225,
      "completions/mean_terminated_length": 11.225,
      "completions/min_length": 7.6,
      "completions/min_terminated_length": 7.6,
      "epoch": 0.05707605556319626,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009432986378669739,
      "kl": 1.5892924427986146,
      "learning_rate": 3.5579710144927535e-06,
      "loss": 0.0016,
      "num_tokens": 1133435.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 830
    },
    {
      "completion_length": 14.325,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 27.8,
      "completions/max_terminated_length": 27.8,
      "completions/mean_length": 14.325,
      "completions/mean_terminated_length": 14.325,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.05776371888323477,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.005321599077433348,
      "kl": 1.4528217315673828,
      "learning_rate": 3.5338164251207735e-06,
      "loss": 0.0015,
      "num_tokens": 1145400.0,
      "reward": 5.9625,
      "reward_std": 0.075,
      "rewards/check_coherence/mean": 1.4875,
      "rewards/check_coherence/std": 0.025,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 840
    },
    {
      "completion_length": 11.3,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.1,
      "completions/max_terminated_length": 15.1,
      "completions/mean_length": 11.3,
      "completions/mean_terminated_length": 11.3,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.058451382203273276,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009609498083591461,
      "kl": 1.5806709051132202,
      "learning_rate": 3.5096618357487926e-06,
      "loss": 0.0016,
      "num_tokens": 1159344.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 850
    },
    {
      "completion_length": 11.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.5,
      "completions/max_terminated_length": 14.5,
      "completions/mean_length": 11.0,
      "completions/mean_terminated_length": 11.0,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.059139045523311784,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0030479375272989273,
      "kl": 1.3322069704532624,
      "learning_rate": 3.485507246376812e-06,
      "loss": 0.0013,
      "num_tokens": 1171028.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 860
    },
    {
      "completion_length": 12.6,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 21.6,
      "completions/max_terminated_length": 21.6,
      "completions/mean_length": 12.6,
      "completions/mean_terminated_length": 12.6,
      "completions/min_length": 7.7,
      "completions/min_terminated_length": 7.7,
      "epoch": 0.0598267088433503,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.004412870854139328,
      "kl": 1.3979312300682067,
      "learning_rate": 3.461352657004831e-06,
      "loss": 0.0014,
      "num_tokens": 1184724.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 870
    },
    {
      "completion_length": 11.825,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 18.5,
      "completions/max_terminated_length": 18.5,
      "completions/mean_length": 11.825,
      "completions/mean_terminated_length": 11.825,
      "completions/min_length": 7.4,
      "completions/min_terminated_length": 7.4,
      "epoch": 0.060514372163388806,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.006056824699044228,
      "kl": 1.382024598121643,
      "learning_rate": 3.4371980676328503e-06,
      "loss": 0.0014,
      "num_tokens": 1195929.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 880
    },
    {
      "completion_length": 10.875,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.4,
      "completions/max_terminated_length": 14.4,
      "completions/mean_length": 10.875,
      "completions/mean_terminated_length": 10.875,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.061202035483427314,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005581828765571117,
      "kl": 1.3684062242507935,
      "learning_rate": 3.4130434782608698e-06,
      "loss": 0.0014,
      "num_tokens": 1207208.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 890
    },
    {
      "completion_length": 12.825,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 19.8,
      "completions/max_terminated_length": 19.8,
      "completions/mean_length": 12.825,
      "completions/mean_terminated_length": 12.825,
      "completions/min_length": 8.6,
      "completions/min_terminated_length": 8.6,
      "epoch": 0.06188969880346582,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0034383470192551613,
      "kl": 1.2667877137660981,
      "learning_rate": 3.3888888888888893e-06,
      "loss": 0.0013,
      "num_tokens": 1220125.0,
      "reward": 5.9875,
      "reward_std": 0.025,
      "rewards/check_coherence/mean": 1.4875,
      "rewards/check_coherence/std": 0.025,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 900
    },
    {
      "completion_length": 10.625,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.0,
      "completions/max_terminated_length": 15.0,
      "completions/mean_length": 10.625,
      "completions/mean_terminated_length": 10.625,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.06257736212350433,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.009170152246952057,
      "kl": 3.0979028224945067,
      "learning_rate": 3.3647342995169084e-06,
      "loss": 0.0031,
      "num_tokens": 1232450.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 910
    },
    {
      "completion_length": 10.425,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.1,
      "completions/max_terminated_length": 13.1,
      "completions/mean_length": 10.425,
      "completions/mean_terminated_length": 10.425,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.06326502544354284,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.011768506839871407,
      "kl": 1.4945785045623778,
      "learning_rate": 3.340579710144928e-06,
      "loss": 0.0015,
      "num_tokens": 1244439.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 920
    },
    {
      "completion_length": 12.525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 19.8,
      "completions/max_terminated_length": 19.8,
      "completions/mean_length": 12.525,
      "completions/mean_terminated_length": 12.525,
      "completions/min_length": 7.4,
      "completions/min_terminated_length": 7.4,
      "epoch": 0.06395268876358134,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.005672871135175228,
      "kl": 1.5882929801940917,
      "learning_rate": 3.316425120772947e-06,
      "loss": 0.0016,
      "num_tokens": 1256788.0,
      "reward": 5.875,
      "reward_std": 0.25,
      "rewards/check_coherence/mean": 1.4625,
      "rewards/check_coherence/std": 0.075,
      "rewards/check_response_quality/mean": 2.45,
      "rewards/check_response_quality/std": 0.1,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 0.975,
      "rewards/match_format_exactly/std": 0.05,
      "step": 930
    },
    {
      "completion_length": 11.65,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.6,
      "completions/max_terminated_length": 16.6,
      "completions/mean_length": 11.65,
      "completions/mean_terminated_length": 11.65,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.06464035208361986,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.008376842364668846,
      "kl": 1.31221883893013,
      "learning_rate": 3.292270531400966e-06,
      "loss": 0.0013,
      "num_tokens": 1268566.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 940
    },
    {
      "completion_length": 41.625,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 136.3,
      "completions/max_terminated_length": 136.3,
      "completions/mean_length": 41.625,
      "completions/mean_terminated_length": 41.625,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.06532801540365837,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0034077914897352457,
      "kl": 1.3816728472709656,
      "learning_rate": 3.268115942028986e-06,
      "loss": 0.0014,
      "num_tokens": 1280407.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 950
    },
    {
      "completion_length": 12.025,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.3,
      "completions/max_terminated_length": 16.3,
      "completions/mean_length": 12.025,
      "completions/mean_terminated_length": 12.025,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.06601567872369687,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003883879631757736,
      "kl": 1.327423983812332,
      "learning_rate": 3.243961352657005e-06,
      "loss": 0.0013,
      "num_tokens": 1290632.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 960
    },
    {
      "completion_length": 11.825,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.4,
      "completions/max_terminated_length": 16.4,
      "completions/mean_length": 11.825,
      "completions/mean_terminated_length": 11.825,
      "completions/min_length": 8.6,
      "completions/min_terminated_length": 8.6,
      "epoch": 0.06670334204373539,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005790382158011198,
      "kl": 1.360637903213501,
      "learning_rate": 3.219806763285024e-06,
      "loss": 0.0014,
      "num_tokens": 1303109.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 970
    },
    {
      "completion_length": 12.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.9,
      "completions/max_terminated_length": 15.9,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.06739100536377389,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009436859749257565,
      "kl": 1.261417853832245,
      "learning_rate": 3.1956521739130437e-06,
      "loss": 0.0013,
      "num_tokens": 1315829.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 980
    },
    {
      "completion_length": 11.9,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.5,
      "completions/max_terminated_length": 17.5,
      "completions/mean_length": 11.9,
      "completions/mean_terminated_length": 11.9,
      "completions/min_length": 8.6,
      "completions/min_terminated_length": 8.6,
      "epoch": 0.0680786686838124,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.023381376639008522,
      "kl": 1.452295684814453,
      "learning_rate": 3.171497584541063e-06,
      "loss": 0.0015,
      "num_tokens": 1327605.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 990
    },
    {
      "completion_length": 10.85,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.9,
      "completions/max_terminated_length": 13.9,
      "completions/mean_length": 10.85,
      "completions/mean_terminated_length": 10.85,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.06876633200385092,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.021764662116765976,
      "kl": 1.3832166135311126,
      "learning_rate": 3.1473429951690827e-06,
      "loss": 0.0014,
      "num_tokens": 1340799.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1000
    },
    {
      "completion_length": 11.1,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.9,
      "completions/max_terminated_length": 14.9,
      "completions/mean_length": 11.1,
      "completions/mean_terminated_length": 11.1,
      "completions/min_length": 8.2,
      "completions/min_terminated_length": 8.2,
      "epoch": 0.06945399532388942,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027048310730606318,
      "kl": 1.2940570950508117,
      "learning_rate": 3.123188405797102e-06,
      "loss": 0.0013,
      "num_tokens": 1353767.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1010
    },
    {
      "completion_length": 25.825,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 72.7,
      "completions/max_terminated_length": 72.7,
      "completions/mean_length": 25.825,
      "completions/mean_terminated_length": 25.825,
      "completions/min_length": 9.6,
      "completions/min_terminated_length": 9.6,
      "epoch": 0.07014165864392793,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0019137284252792597,
      "kl": 1.2903836131095887,
      "learning_rate": 3.099033816425121e-06,
      "loss": 0.0013,
      "num_tokens": 1366024.0,
      "reward": 5.9625,
      "reward_std": 0.075,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.05,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1020
    },
    {
      "completion_length": 11.875,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.1,
      "completions/max_terminated_length": 16.1,
      "completions/mean_length": 11.875,
      "completions/mean_terminated_length": 11.875,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.07082932196396644,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004841749556362629,
      "kl": 1.242459374666214,
      "learning_rate": 3.0748792270531404e-06,
      "loss": 0.0012,
      "num_tokens": 1377851.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1030
    },
    {
      "completion_length": 11.925,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.7,
      "completions/max_terminated_length": 16.7,
      "completions/mean_length": 11.925,
      "completions/mean_terminated_length": 11.925,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.07151698528400495,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00714073283597827,
      "kl": 1.2855034828186036,
      "learning_rate": 3.0507246376811595e-06,
      "loss": 0.0013,
      "num_tokens": 1389808.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1040
    },
    {
      "completion_length": 10.125,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.2,
      "completions/max_terminated_length": 12.2,
      "completions/mean_length": 10.125,
      "completions/mean_terminated_length": 10.125,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.07220464860404346,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00511928740888834,
      "kl": 1.4257906317710876,
      "learning_rate": 3.026570048309179e-06,
      "loss": 0.0014,
      "num_tokens": 1400881.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1050
    },
    {
      "completion_length": 22.875,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 61.2,
      "completions/max_terminated_length": 61.2,
      "completions/mean_length": 22.875,
      "completions/mean_terminated_length": 22.875,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.07289231192408197,
      "frac_reward_zero_std": 0.7,
      "grad_norm": 3.7891669273376465,
      "kl": 1.345647120475769,
      "learning_rate": 3.0024154589371985e-06,
      "loss": 0.0013,
      "num_tokens": 1414468.0,
      "reward": 5.9,
      "reward_std": 0.2,
      "rewards/check_coherence/mean": 1.45,
      "rewards/check_coherence/std": 0.1,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.05,
      "rewards/match_format_approximately/mean": 0.975,
      "rewards/match_format_approximately/std": 0.05,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1060
    },
    {
      "completion_length": 9.65,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.7,
      "completions/max_terminated_length": 12.7,
      "completions/mean_length": 9.65,
      "completions/mean_terminated_length": 9.65,
      "completions/min_length": 7.6,
      "completions/min_terminated_length": 7.6,
      "epoch": 0.07357997524412048,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.03541218861937523,
      "kl": 1.4368322610855102,
      "learning_rate": 2.9782608695652176e-06,
      "loss": 0.0014,
      "num_tokens": 1425650.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1070
    },
    {
      "completion_length": 10.525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.0,
      "completions/max_terminated_length": 14.0,
      "completions/mean_length": 10.525,
      "completions/mean_terminated_length": 10.525,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.074267638564159,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005114713683724403,
      "kl": 1.6687398076057434,
      "learning_rate": 2.9541062801932367e-06,
      "loss": 0.0017,
      "num_tokens": 1437255.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1080
    },
    {
      "completion_length": 20.95,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 52.2,
      "completions/max_terminated_length": 52.2,
      "completions/mean_length": 20.95,
      "completions/mean_terminated_length": 20.95,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.0749553018841975,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.005882755853235722,
      "kl": 2.042826807498932,
      "learning_rate": 2.9299516908212562e-06,
      "loss": 0.002,
      "num_tokens": 1450757.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1090
    },
    {
      "completion_length": 10.225,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.4,
      "completions/max_terminated_length": 15.4,
      "completions/mean_length": 10.225,
      "completions/mean_terminated_length": 10.225,
      "completions/min_length": 7.6,
      "completions/min_terminated_length": 7.6,
      "epoch": 0.07564296520423601,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006175467278808355,
      "kl": 1.8858429431915282,
      "learning_rate": 2.9057971014492758e-06,
      "loss": 0.0019,
      "num_tokens": 1465250.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1100
    },
    {
      "completion_length": 10.7,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.2,
      "completions/max_terminated_length": 13.2,
      "completions/mean_length": 10.7,
      "completions/mean_terminated_length": 10.7,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.07633062852427451,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.01212387252599001,
      "kl": 1.431203842163086,
      "learning_rate": 2.8816425120772953e-06,
      "loss": 0.0014,
      "num_tokens": 1476954.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1110
    },
    {
      "completion_length": 10.95,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.5,
      "completions/max_terminated_length": 14.5,
      "completions/mean_length": 10.95,
      "completions/mean_terminated_length": 10.95,
      "completions/min_length": 7.5,
      "completions/min_terminated_length": 7.5,
      "epoch": 0.07701829184431302,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007129206787794828,
      "kl": 1.5234606921672822,
      "learning_rate": 2.8574879227053144e-06,
      "loss": 0.0015,
      "num_tokens": 1489896.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1120
    },
    {
      "completion_length": 9.95,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 11.9,
      "completions/max_terminated_length": 11.9,
      "completions/mean_length": 9.95,
      "completions/mean_terminated_length": 9.95,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.07770595516435154,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005986523348838091,
      "kl": 1.9270170032978058,
      "learning_rate": 2.8333333333333335e-06,
      "loss": 0.0019,
      "num_tokens": 1500578.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1130
    },
    {
      "completion_length": 14.4,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 25.3,
      "completions/max_terminated_length": 25.3,
      "completions/mean_length": 14.4,
      "completions/mean_terminated_length": 14.4,
      "completions/min_length": 8.6,
      "completions/min_terminated_length": 8.6,
      "epoch": 0.07839361848439004,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.010054023936390877,
      "kl": 1.3340931117534638,
      "learning_rate": 2.8091787439613525e-06,
      "loss": 0.0013,
      "num_tokens": 1512906.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1140
    },
    {
      "completion_length": 20.15,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 49.2,
      "completions/max_terminated_length": 49.2,
      "completions/mean_length": 20.15,
      "completions/mean_terminated_length": 20.15,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.07908128180442855,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0031711028423160315,
      "kl": 1.3866869628429412,
      "learning_rate": 2.7850241545893725e-06,
      "loss": 0.0014,
      "num_tokens": 1524192.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1150
    },
    {
      "completion_length": 13.175,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 23.6,
      "completions/max_terminated_length": 23.6,
      "completions/mean_length": 13.175,
      "completions/mean_terminated_length": 13.175,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.07976894512446706,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.01567765325307846,
      "kl": 1.2489181756973267,
      "learning_rate": 2.7608695652173916e-06,
      "loss": 0.0012,
      "num_tokens": 1536839.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1160
    },
    {
      "completion_length": 12.175,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 18.5,
      "completions/max_terminated_length": 18.5,
      "completions/mean_length": 12.175,
      "completions/mean_terminated_length": 12.175,
      "completions/min_length": 7.9,
      "completions/min_terminated_length": 7.9,
      "epoch": 0.08045660844450557,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.01604825258255005,
      "kl": 1.3663283944129945,
      "learning_rate": 2.736714975845411e-06,
      "loss": 0.0014,
      "num_tokens": 1550150.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1170
    },
    {
      "completion_length": 11.4,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.2,
      "completions/max_terminated_length": 16.2,
      "completions/mean_length": 11.4,
      "completions/mean_terminated_length": 11.4,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.08114427176454408,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025910416152328253,
      "kl": 1.2825160801410675,
      "learning_rate": 2.71256038647343e-06,
      "loss": 0.0013,
      "num_tokens": 1562550.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1180
    },
    {
      "completion_length": 11.35,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.4,
      "completions/max_terminated_length": 14.4,
      "completions/mean_length": 11.35,
      "completions/mean_terminated_length": 11.35,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.08183193508458259,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024119107984006405,
      "kl": 1.2274268567562103,
      "learning_rate": 2.6884057971014493e-06,
      "loss": 0.0012,
      "num_tokens": 1575764.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1190
    },
    {
      "completion_length": 10.25,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.9,
      "completions/max_terminated_length": 12.9,
      "completions/mean_length": 10.25,
      "completions/mean_terminated_length": 10.25,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.0825195984046211,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0072606573812663555,
      "kl": 1.4195207893848418,
      "learning_rate": 2.6642512077294684e-06,
      "loss": 0.0014,
      "num_tokens": 1589350.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1200
    },
    {
      "completion_length": 10.25,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.1,
      "completions/max_terminated_length": 13.1,
      "completions/mean_length": 10.25,
      "completions/mean_terminated_length": 10.25,
      "completions/min_length": 7.8,
      "completions/min_terminated_length": 7.8,
      "epoch": 0.0832072617246596,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019173582550138235,
      "kl": 1.3919144153594971,
      "learning_rate": 2.6400966183574883e-06,
      "loss": 0.0014,
      "num_tokens": 1602132.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1210
    },
    {
      "completion_length": 11.15,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.9,
      "completions/max_terminated_length": 15.9,
      "completions/mean_length": 11.15,
      "completions/mean_terminated_length": 11.15,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.08389492504469812,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004679285455495119,
      "kl": 1.2476289927959443,
      "learning_rate": 2.6159420289855074e-06,
      "loss": 0.0012,
      "num_tokens": 1612766.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1220
    },
    {
      "completion_length": 11.35,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.4,
      "completions/max_terminated_length": 15.4,
      "completions/mean_length": 11.35,
      "completions/mean_terminated_length": 11.35,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.08458258836473663,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002475257497280836,
      "kl": 1.4096310913562775,
      "learning_rate": 2.591787439613527e-06,
      "loss": 0.0014,
      "num_tokens": 1625160.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1230
    },
    {
      "completion_length": 10.875,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.6,
      "completions/max_terminated_length": 13.6,
      "completions/mean_length": 10.875,
      "completions/mean_terminated_length": 10.875,
      "completions/min_length": 9.4,
      "completions/min_terminated_length": 9.4,
      "epoch": 0.08527025168477513,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.00401447806507349,
      "kl": 1.3029845595359801,
      "learning_rate": 2.567632850241546e-06,
      "loss": 0.0013,
      "num_tokens": 1637763.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1240
    },
    {
      "completion_length": 16.95,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 39.6,
      "completions/max_terminated_length": 39.6,
      "completions/mean_length": 16.95,
      "completions/mean_terminated_length": 16.95,
      "completions/min_length": 7.9,
      "completions/min_terminated_length": 7.9,
      "epoch": 0.08595791500481365,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.005188316572457552,
      "kl": 1.3585187375545502,
      "learning_rate": 2.543478260869565e-06,
      "loss": 0.0014,
      "num_tokens": 1649345.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1250
    },
    {
      "completion_length": 11.125,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.1,
      "completions/max_terminated_length": 14.1,
      "completions/mean_length": 11.125,
      "completions/mean_terminated_length": 11.125,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.08664557832485215,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.025956762954592705,
      "kl": 1.5334448099136353,
      "learning_rate": 2.519323671497585e-06,
      "loss": 0.0015,
      "num_tokens": 1661062.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1260
    },
    {
      "completion_length": 11.275,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.5,
      "completions/max_terminated_length": 16.5,
      "completions/mean_length": 11.275,
      "completions/mean_terminated_length": 11.275,
      "completions/min_length": 8.1,
      "completions/min_terminated_length": 8.1,
      "epoch": 0.08733324164489066,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004098770674318075,
      "kl": 1.2529529988765717,
      "learning_rate": 2.495169082125604e-06,
      "loss": 0.0013,
      "num_tokens": 1673301.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1270
    },
    {
      "completion_length": 10.525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.1,
      "completions/max_terminated_length": 13.1,
      "completions/mean_length": 10.525,
      "completions/mean_terminated_length": 10.525,
      "completions/min_length": 8.2,
      "completions/min_terminated_length": 8.2,
      "epoch": 0.08802090496492918,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032903961837291718,
      "kl": 1.2799896121025085,
      "learning_rate": 2.471014492753623e-06,
      "loss": 0.0013,
      "num_tokens": 1685302.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1280
    },
    {
      "completion_length": 10.35,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.5,
      "completions/max_terminated_length": 13.5,
      "completions/mean_length": 10.35,
      "completions/mean_terminated_length": 10.35,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.08870856828496768,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00576725322753191,
      "kl": 1.3937119126319886,
      "learning_rate": 2.4468599033816427e-06,
      "loss": 0.0014,
      "num_tokens": 1697512.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1290
    },
    {
      "completion_length": 10.725,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.5,
      "completions/max_terminated_length": 14.5,
      "completions/mean_length": 10.725,
      "completions/mean_terminated_length": 10.725,
      "completions/min_length": 8.2,
      "completions/min_terminated_length": 8.2,
      "epoch": 0.08939623160500619,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009266716428101063,
      "kl": 1.3509972453117371,
      "learning_rate": 2.4227053140096622e-06,
      "loss": 0.0014,
      "num_tokens": 1708789.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1300
    },
    {
      "completion_length": 10.9,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.1,
      "completions/max_terminated_length": 14.1,
      "completions/mean_length": 10.9,
      "completions/mean_terminated_length": 10.9,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.09008389492504469,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004442331846803427,
      "kl": 1.4136140465736389,
      "learning_rate": 2.3985507246376813e-06,
      "loss": 0.0014,
      "num_tokens": 1721465.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1310
    },
    {
      "completion_length": 12.85,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 21.4,
      "completions/max_terminated_length": 21.4,
      "completions/mean_length": 12.85,
      "completions/mean_terminated_length": 12.85,
      "completions/min_length": 7.8,
      "completions/min_terminated_length": 7.8,
      "epoch": 0.0907715582450832,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.00520370714366436,
      "kl": 1.6327512860298157,
      "learning_rate": 2.374396135265701e-06,
      "loss": 0.0016,
      "num_tokens": 1732827.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1320
    },
    {
      "completion_length": 11.45,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.6,
      "completions/max_terminated_length": 15.6,
      "completions/mean_length": 11.45,
      "completions/mean_terminated_length": 11.45,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.09145922156512172,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.15331777930259705,
      "kl": 1.3064857959747314,
      "learning_rate": 2.35024154589372e-06,
      "loss": 0.0013,
      "num_tokens": 1744241.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1330
    },
    {
      "completion_length": 10.125,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.5,
      "completions/max_terminated_length": 12.5,
      "completions/mean_length": 10.125,
      "completions/mean_terminated_length": 10.125,
      "completions/min_length": 7.9,
      "completions/min_terminated_length": 7.9,
      "epoch": 0.09214688488516022,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.01661134511232376,
      "kl": 1.4703512012958526,
      "learning_rate": 2.326086956521739e-06,
      "loss": 0.0015,
      "num_tokens": 1757074.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1340
    },
    {
      "completion_length": 12.15,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.9,
      "completions/max_terminated_length": 17.9,
      "completions/mean_length": 12.15,
      "completions/mean_terminated_length": 12.15,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.09283454820519874,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002100709592923522,
      "kl": 1.1705379903316497,
      "learning_rate": 2.3019323671497585e-06,
      "loss": 0.0012,
      "num_tokens": 1768636.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1350
    },
    {
      "completion_length": 15.125,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 31.0,
      "completions/max_terminated_length": 31.0,
      "completions/mean_length": 15.125,
      "completions/mean_terminated_length": 15.125,
      "completions/min_length": 7.9,
      "completions/min_terminated_length": 7.9,
      "epoch": 0.09352221152523725,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0030410720501095057,
      "kl": 1.1578946471214295,
      "learning_rate": 2.277777777777778e-06,
      "loss": 0.0012,
      "num_tokens": 1781393.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1360
    },
    {
      "completion_length": 9.95,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.6,
      "completions/max_terminated_length": 12.6,
      "completions/mean_length": 9.95,
      "completions/mean_terminated_length": 9.95,
      "completions/min_length": 7.8,
      "completions/min_terminated_length": 7.8,
      "epoch": 0.09420987484527575,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005324806552380323,
      "kl": 1.351249760389328,
      "learning_rate": 2.2536231884057976e-06,
      "loss": 0.0014,
      "num_tokens": 1794791.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1370
    },
    {
      "completion_length": 13.075,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 22.4,
      "completions/max_terminated_length": 22.4,
      "completions/mean_length": 13.075,
      "completions/mean_terminated_length": 13.075,
      "completions/min_length": 7.8,
      "completions/min_terminated_length": 7.8,
      "epoch": 0.09489753816531427,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.004393610171973705,
      "kl": 1.335606962442398,
      "learning_rate": 2.2294685990338166e-06,
      "loss": 0.0013,
      "num_tokens": 1807182.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1380
    },
    {
      "completion_length": 10.375,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.3,
      "completions/max_terminated_length": 14.3,
      "completions/mean_length": 10.375,
      "completions/mean_terminated_length": 10.375,
      "completions/min_length": 7.8,
      "completions/min_terminated_length": 7.8,
      "epoch": 0.09558520148535277,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.011187166906893253,
      "kl": 1.3981504678726195,
      "learning_rate": 2.2053140096618357e-06,
      "loss": 0.0014,
      "num_tokens": 1817889.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1390
    },
    {
      "completion_length": 11.125,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.3,
      "completions/max_terminated_length": 14.3,
      "completions/mean_length": 11.125,
      "completions/mean_terminated_length": 11.125,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.09627286480539128,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00982034020125866,
      "kl": 1.439992618560791,
      "learning_rate": 2.1811594202898552e-06,
      "loss": 0.0014,
      "num_tokens": 1830338.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1400
    },
    {
      "completion_length": 12.575,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 20.9,
      "completions/max_terminated_length": 20.9,
      "completions/mean_length": 12.575,
      "completions/mean_terminated_length": 12.575,
      "completions/min_length": 7.6,
      "completions/min_terminated_length": 7.6,
      "epoch": 0.0969605281254298,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0037614547181874514,
      "kl": 1.459202778339386,
      "learning_rate": 2.1570048309178743e-06,
      "loss": 0.0015,
      "num_tokens": 1841853.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1410
    },
    {
      "completion_length": 11.325,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.7,
      "completions/max_terminated_length": 17.7,
      "completions/mean_length": 11.325,
      "completions/mean_terminated_length": 11.325,
      "completions/min_length": 8.1,
      "completions/min_terminated_length": 8.1,
      "epoch": 0.0976481914454683,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022658687084913254,
      "kl": 1.3880295991897582,
      "learning_rate": 2.132850241545894e-06,
      "loss": 0.0014,
      "num_tokens": 1853382.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1420
    },
    {
      "completion_length": 12.575,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 18.7,
      "completions/max_terminated_length": 18.7,
      "completions/mean_length": 12.575,
      "completions/mean_terminated_length": 12.575,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.09833585476550681,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0058380733244121075,
      "kl": 1.4471020340919494,
      "learning_rate": 2.1086956521739134e-06,
      "loss": 0.0014,
      "num_tokens": 1866417.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1430
    },
    {
      "completion_length": 11.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.3,
      "completions/max_terminated_length": 15.3,
      "completions/mean_length": 11.0,
      "completions/mean_terminated_length": 11.0,
      "completions/min_length": 8.6,
      "completions/min_terminated_length": 8.6,
      "epoch": 0.09902351808554531,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.01883215643465519,
      "kl": 1.319870752096176,
      "learning_rate": 2.0845410628019325e-06,
      "loss": 0.0013,
      "num_tokens": 1877421.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1440
    },
    {
      "completion_length": 13.3,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 23.6,
      "completions/max_terminated_length": 23.6,
      "completions/mean_length": 13.3,
      "completions/mean_terminated_length": 13.3,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.09971118140558383,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 2.100250482559204,
      "kl": 1.1677093982696534,
      "learning_rate": 2.060386473429952e-06,
      "loss": 0.0012,
      "num_tokens": 1890953.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1450
    },
    {
      "completion_length": 10.3,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.8,
      "completions/max_terminated_length": 12.8,
      "completions/mean_length": 10.3,
      "completions/mean_terminated_length": 10.3,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.10039884472562234,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003426405368372798,
      "kl": 1.348715353012085,
      "learning_rate": 2.036231884057971e-06,
      "loss": 0.0013,
      "num_tokens": 1902461.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1460
    },
    {
      "completion_length": 11.925,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.7,
      "completions/max_terminated_length": 15.7,
      "completions/mean_length": 11.925,
      "completions/mean_terminated_length": 11.925,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.10108650804566084,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004393596667796373,
      "kl": 1.2271339356899262,
      "learning_rate": 2.0120772946859906e-06,
      "loss": 0.0012,
      "num_tokens": 1914978.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1470
    },
    {
      "completion_length": 10.8,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.3,
      "completions/max_terminated_length": 14.3,
      "completions/mean_length": 10.8,
      "completions/mean_terminated_length": 10.8,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.10177417136569936,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 7.2538251876831055,
      "kl": 1.4542439758777619,
      "learning_rate": 1.98792270531401e-06,
      "loss": 0.0015,
      "num_tokens": 1927398.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1480
    },
    {
      "completion_length": 10.375,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.6,
      "completions/max_terminated_length": 13.6,
      "completions/mean_length": 10.375,
      "completions/mean_terminated_length": 10.375,
      "completions/min_length": 8.1,
      "completions/min_terminated_length": 8.1,
      "epoch": 0.10246183468573786,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007453908212482929,
      "kl": 1.3577909409999847,
      "learning_rate": 1.963768115942029e-06,
      "loss": 0.0014,
      "num_tokens": 1939537.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1490
    },
    {
      "completion_length": 11.175,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.6,
      "completions/max_terminated_length": 15.6,
      "completions/mean_length": 11.175,
      "completions/mean_terminated_length": 11.175,
      "completions/min_length": 8.2,
      "completions/min_terminated_length": 8.2,
      "epoch": 0.10314949800577637,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.017898349091410637,
      "kl": 1.2097175359725951,
      "learning_rate": 1.9396135265700487e-06,
      "loss": 0.0012,
      "num_tokens": 1950164.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1500
    },
    {
      "completion_length": 10.475,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.4,
      "completions/max_terminated_length": 14.4,
      "completions/mean_length": 10.475,
      "completions/mean_terminated_length": 10.475,
      "completions/min_length": 7.8,
      "completions/min_terminated_length": 7.8,
      "epoch": 0.10383716132581489,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.02172171324491501,
      "kl": 1.461452579498291,
      "learning_rate": 1.9154589371980678e-06,
      "loss": 0.0015,
      "num_tokens": 1962247.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1510
    },
    {
      "completion_length": 11.625,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.7,
      "completions/max_terminated_length": 17.7,
      "completions/mean_length": 11.625,
      "completions/mean_terminated_length": 11.625,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.10452482464585339,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 4.611449241638184,
      "kl": 1.579625529050827,
      "learning_rate": 1.891304347826087e-06,
      "loss": 0.0016,
      "num_tokens": 1972028.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1520
    },
    {
      "completion_length": 20.1,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 50.9,
      "completions/max_terminated_length": 50.9,
      "completions/mean_length": 20.1,
      "completions/mean_terminated_length": 20.1,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.1052124879658919,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0037286400329321623,
      "kl": 1.2361969292163848,
      "learning_rate": 1.8671497584541066e-06,
      "loss": 0.0012,
      "num_tokens": 1985076.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1530
    },
    {
      "completion_length": 27.475,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 82.3,
      "completions/max_terminated_length": 82.3,
      "completions/mean_length": 27.475,
      "completions/mean_terminated_length": 27.475,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.1059001512859304,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 1.7765352725982666,
      "kl": 1.4994196772575379,
      "learning_rate": 1.8429951690821257e-06,
      "loss": 0.0015,
      "num_tokens": 1998887.0,
      "reward": 5.9,
      "reward_std": 0.2,
      "rewards/check_coherence/mean": 1.45,
      "rewards/check_coherence/std": 0.1,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.05,
      "rewards/match_format_approximately/mean": 0.975,
      "rewards/match_format_approximately/std": 0.05,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1540
    },
    {
      "completion_length": 10.325,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.7,
      "completions/max_terminated_length": 12.7,
      "completions/mean_length": 10.325,
      "completions/mean_terminated_length": 10.325,
      "completions/min_length": 7.9,
      "completions/min_terminated_length": 7.9,
      "epoch": 0.10658781460596892,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.013411914929747581,
      "kl": 1.5437786877155304,
      "learning_rate": 1.818840579710145e-06,
      "loss": 0.0015,
      "num_tokens": 2010696.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1550
    },
    {
      "completion_length": 13.5,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 24.4,
      "completions/max_terminated_length": 24.4,
      "completions/mean_length": 13.5,
      "completions/mean_terminated_length": 13.5,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.10727547792600743,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.002627303823828697,
      "kl": 1.1799913942813873,
      "learning_rate": 1.7946859903381645e-06,
      "loss": 0.0012,
      "num_tokens": 2024560.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1560
    },
    {
      "completion_length": 11.175,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.0,
      "completions/max_terminated_length": 15.0,
      "completions/mean_length": 11.175,
      "completions/mean_terminated_length": 11.175,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.10796314124604593,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034348671324551105,
      "kl": 1.3212086796760558,
      "learning_rate": 1.7705314009661836e-06,
      "loss": 0.0013,
      "num_tokens": 2037779.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1570
    },
    {
      "completion_length": 10.175,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.6,
      "completions/max_terminated_length": 12.6,
      "completions/mean_length": 10.175,
      "completions/mean_terminated_length": 10.175,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.10865080456608445,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008126153610646725,
      "kl": 1.4890945374965667,
      "learning_rate": 1.7463768115942031e-06,
      "loss": 0.0015,
      "num_tokens": 2049986.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1580
    },
    {
      "completion_length": 11.55,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.5,
      "completions/max_terminated_length": 16.5,
      "completions/mean_length": 11.55,
      "completions/mean_terminated_length": 11.55,
      "completions/min_length": 8.2,
      "completions/min_terminated_length": 8.2,
      "epoch": 0.10933846788612295,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003164671128615737,
      "kl": 1.8153117537498473,
      "learning_rate": 1.7222222222222224e-06,
      "loss": 0.0018,
      "num_tokens": 2061292.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1590
    },
    {
      "completion_length": 11.8,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.0,
      "completions/max_terminated_length": 16.0,
      "completions/mean_length": 11.8,
      "completions/mean_terminated_length": 11.8,
      "completions/min_length": 7.9,
      "completions/min_terminated_length": 7.9,
      "epoch": 0.11002613120616146,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008705941960215569,
      "kl": 1.372535276412964,
      "learning_rate": 1.6980676328502415e-06,
      "loss": 0.0014,
      "num_tokens": 2072076.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1600
    },
    {
      "completion_length": 11.25,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.7,
      "completions/max_terminated_length": 13.7,
      "completions/mean_length": 11.25,
      "completions/mean_terminated_length": 11.25,
      "completions/min_length": 9.2,
      "completions/min_terminated_length": 9.2,
      "epoch": 0.11071379452619998,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 3.7534122467041016,
      "kl": 1.3290419816970824,
      "learning_rate": 1.673913043478261e-06,
      "loss": 0.0013,
      "num_tokens": 2084146.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1610
    },
    {
      "completion_length": 10.725,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.8,
      "completions/max_terminated_length": 13.8,
      "completions/mean_length": 10.725,
      "completions/mean_terminated_length": 10.725,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.11140145784623848,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007745617534965277,
      "kl": 1.4284833550453186,
      "learning_rate": 1.6497584541062803e-06,
      "loss": 0.0014,
      "num_tokens": 2094831.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1620
    },
    {
      "completion_length": 11.55,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.0,
      "completions/max_terminated_length": 15.0,
      "completions/mean_length": 11.55,
      "completions/mean_terminated_length": 11.55,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.11208912116627699,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007192945573478937,
      "kl": 1.3611448645591735,
      "learning_rate": 1.6256038647342998e-06,
      "loss": 0.0014,
      "num_tokens": 2108805.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1630
    },
    {
      "completion_length": 54.85,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 184.4,
      "completions/max_terminated_length": 184.4,
      "completions/mean_length": 54.85,
      "completions/mean_terminated_length": 54.85,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.1127767844863155,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.0031543634831905365,
      "kl": 1.1925908386707307,
      "learning_rate": 1.601449275362319e-06,
      "loss": 0.0012,
      "num_tokens": 2123923.0,
      "reward": 5.925,
      "reward_std": 0.10773502588272095,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4625,
      "rewards/check_response_quality/std": 0.053867512941360475,
      "rewards/match_format_approximately/mean": 0.9625,
      "rewards/match_format_approximately/std": 0.053867512941360475,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1640
    },
    {
      "completion_length": 10.875,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.8,
      "completions/max_terminated_length": 13.8,
      "completions/mean_length": 10.875,
      "completions/mean_terminated_length": 10.875,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.11346444780635401,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003872538451105356,
      "kl": 1.4870728313922883,
      "learning_rate": 1.5772946859903382e-06,
      "loss": 0.0015,
      "num_tokens": 2135622.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1650
    },
    {
      "completion_length": 11.275,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.4,
      "completions/max_terminated_length": 15.4,
      "completions/mean_length": 11.275,
      "completions/mean_terminated_length": 11.275,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.11415211112639252,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006047138012945652,
      "kl": 1.2302622616291046,
      "learning_rate": 1.5531400966183577e-06,
      "loss": 0.0012,
      "num_tokens": 2148629.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1660
    },
    {
      "completion_length": 10.575,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.2,
      "completions/max_terminated_length": 15.2,
      "completions/mean_length": 10.575,
      "completions/mean_terminated_length": 10.575,
      "completions/min_length": 7.5,
      "completions/min_terminated_length": 7.5,
      "epoch": 0.11483977444643102,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 3.129744291305542,
      "kl": 1.347231537103653,
      "learning_rate": 1.5289855072463768e-06,
      "loss": 0.0013,
      "num_tokens": 2157484.0,
      "reward": 5.9875,
      "reward_std": 0.025,
      "rewards/check_coherence/mean": 1.4875,
      "rewards/check_coherence/std": 0.025,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1670
    },
    {
      "completion_length": 10.85,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.7,
      "completions/max_terminated_length": 13.7,
      "completions/mean_length": 10.85,
      "completions/mean_terminated_length": 10.85,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.11552743776646954,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005663453601300716,
      "kl": 1.3255208492279054,
      "learning_rate": 1.5048309178743963e-06,
      "loss": 0.0013,
      "num_tokens": 2172046.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1680
    },
    {
      "completion_length": 10.4,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.1,
      "completions/max_terminated_length": 13.1,
      "completions/mean_length": 10.4,
      "completions/mean_terminated_length": 10.4,
      "completions/min_length": 8.2,
      "completions/min_terminated_length": 8.2,
      "epoch": 0.11621510108650805,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007839322090148926,
      "kl": 1.516604733467102,
      "learning_rate": 1.4806763285024156e-06,
      "loss": 0.0015,
      "num_tokens": 2184446.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1690
    },
    {
      "completion_length": 11.5,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.7,
      "completions/max_terminated_length": 15.7,
      "completions/mean_length": 11.5,
      "completions/mean_terminated_length": 11.5,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.11690276440654655,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0044697728008031845,
      "kl": 1.2774751663208008,
      "learning_rate": 1.4565217391304347e-06,
      "loss": 0.0013,
      "num_tokens": 2198042.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1700
    },
    {
      "completion_length": 10.25,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.3,
      "completions/max_terminated_length": 13.3,
      "completions/mean_length": 10.25,
      "completions/mean_terminated_length": 10.25,
      "completions/min_length": 7.6,
      "completions/min_terminated_length": 7.6,
      "epoch": 0.11759042772658507,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008361663669347763,
      "kl": 1.3569441199302674,
      "learning_rate": 1.4323671497584543e-06,
      "loss": 0.0014,
      "num_tokens": 2210628.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1710
    },
    {
      "completion_length": 12.8,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 19.9,
      "completions/max_terminated_length": 19.9,
      "completions/mean_length": 12.8,
      "completions/mean_terminated_length": 12.8,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.11827809104662357,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 2.7055230140686035,
      "kl": 1.339830869436264,
      "learning_rate": 1.4082125603864736e-06,
      "loss": 0.0013,
      "num_tokens": 2223380.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1720
    },
    {
      "completion_length": 13.175,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 19.5,
      "completions/max_terminated_length": 19.5,
      "completions/mean_length": 13.175,
      "completions/mean_terminated_length": 13.175,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.11896575436666208,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005280562676489353,
      "kl": 1.2809099555015564,
      "learning_rate": 1.3840579710144926e-06,
      "loss": 0.0013,
      "num_tokens": 2235851.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1730
    },
    {
      "completion_length": 12.1,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.8,
      "completions/max_terminated_length": 17.8,
      "completions/mean_length": 12.1,
      "completions/mean_terminated_length": 12.1,
      "completions/min_length": 8.6,
      "completions/min_terminated_length": 8.6,
      "epoch": 0.1196534176867006,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004883504938334227,
      "kl": 1.0983420848846435,
      "learning_rate": 1.3599033816425122e-06,
      "loss": 0.0011,
      "num_tokens": 2248511.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1740
    },
    {
      "completion_length": 11.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.8,
      "completions/max_terminated_length": 14.8,
      "completions/mean_length": 11.0,
      "completions/mean_terminated_length": 11.0,
      "completions/min_length": 7.9,
      "completions/min_terminated_length": 7.9,
      "epoch": 0.1203410810067391,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005267248023301363,
      "kl": 1.330749648809433,
      "learning_rate": 1.3357487922705315e-06,
      "loss": 0.0013,
      "num_tokens": 2259335.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1750
    },
    {
      "completion_length": 10.775,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.7,
      "completions/max_terminated_length": 14.7,
      "completions/mean_length": 10.775,
      "completions/mean_terminated_length": 10.775,
      "completions/min_length": 8.1,
      "completions/min_terminated_length": 8.1,
      "epoch": 0.12102874432677761,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 5.972342491149902,
      "kl": 1.4707993149757386,
      "learning_rate": 1.311594202898551e-06,
      "loss": 0.0015,
      "num_tokens": 2273490.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1760
    },
    {
      "completion_length": 10.8,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.9,
      "completions/max_terminated_length": 13.9,
      "completions/mean_length": 10.8,
      "completions/mean_terminated_length": 10.8,
      "completions/min_length": 8.2,
      "completions/min_terminated_length": 8.2,
      "epoch": 0.12171640764681611,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008044744841754436,
      "kl": 1.393112576007843,
      "learning_rate": 1.28743961352657e-06,
      "loss": 0.0014,
      "num_tokens": 2285606.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1770
    },
    {
      "completion_length": 11.7,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.9,
      "completions/max_terminated_length": 15.9,
      "completions/mean_length": 11.7,
      "completions/mean_terminated_length": 11.7,
      "completions/min_length": 8.6,
      "completions/min_terminated_length": 8.6,
      "epoch": 0.12240407096685463,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0036649706307798624,
      "kl": 1.138010984659195,
      "learning_rate": 1.2632850241545894e-06,
      "loss": 0.0011,
      "num_tokens": 2297982.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1780
    },
    {
      "completion_length": 9.925,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.3,
      "completions/max_terminated_length": 12.3,
      "completions/mean_length": 9.925,
      "completions/mean_terminated_length": 9.925,
      "completions/min_length": 7.5,
      "completions/min_terminated_length": 7.5,
      "epoch": 0.12309173428689314,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00226943870075047,
      "kl": 1.3314465701580047,
      "learning_rate": 1.2391304347826089e-06,
      "loss": 0.0013,
      "num_tokens": 2309427.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1790
    },
    {
      "completion_length": 20.775,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 52.6,
      "completions/max_terminated_length": 52.6,
      "completions/mean_length": 20.775,
      "completions/mean_terminated_length": 20.775,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.12377939760693164,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0048567261546850204,
      "kl": 1.1917468369007111,
      "learning_rate": 1.214975845410628e-06,
      "loss": 0.0012,
      "num_tokens": 2321950.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1800
    },
    {
      "completion_length": 10.1,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.8,
      "completions/max_terminated_length": 12.8,
      "completions/mean_length": 10.1,
      "completions/mean_terminated_length": 10.1,
      "completions/min_length": 7.8,
      "completions/min_terminated_length": 7.8,
      "epoch": 0.12446706092697016,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004037019796669483,
      "kl": 1.2711975991725921,
      "learning_rate": 1.1908212560386475e-06,
      "loss": 0.0013,
      "num_tokens": 2333018.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1810
    },
    {
      "completion_length": 11.6,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.9,
      "completions/max_terminated_length": 17.9,
      "completions/mean_length": 11.6,
      "completions/mean_terminated_length": 11.6,
      "completions/min_length": 8.1,
      "completions/min_terminated_length": 8.1,
      "epoch": 0.12515472424700866,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004209557548165321,
      "kl": 1.2455637753009796,
      "learning_rate": 1.1666666666666668e-06,
      "loss": 0.0012,
      "num_tokens": 2345706.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1820
    },
    {
      "completion_length": 10.9,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.0,
      "completions/max_terminated_length": 15.0,
      "completions/mean_length": 10.9,
      "completions/mean_terminated_length": 10.9,
      "completions/min_length": 8.2,
      "completions/min_terminated_length": 8.2,
      "epoch": 0.1258423875670472,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004164872225373983,
      "kl": 1.3472298622131347,
      "learning_rate": 1.142512077294686e-06,
      "loss": 0.0013,
      "num_tokens": 2357826.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1830
    },
    {
      "completion_length": 10.525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.6,
      "completions/max_terminated_length": 13.6,
      "completions/mean_length": 10.525,
      "completions/mean_terminated_length": 10.525,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.1265300508870857,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.010473374277353287,
      "kl": 1.3082951486110688,
      "learning_rate": 1.1183574879227054e-06,
      "loss": 0.0013,
      "num_tokens": 2369667.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1840
    },
    {
      "completion_length": 11.05,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.0,
      "completions/max_terminated_length": 16.0,
      "completions/mean_length": 11.05,
      "completions/mean_terminated_length": 11.05,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.1272177142071242,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004580930341035128,
      "kl": 1.3786839723587037,
      "learning_rate": 1.0942028985507247e-06,
      "loss": 0.0014,
      "num_tokens": 2381753.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1850
    },
    {
      "completion_length": 11.175,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.9,
      "completions/max_terminated_length": 13.9,
      "completions/mean_length": 11.175,
      "completions/mean_terminated_length": 11.175,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.1279053775271627,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00417534913867712,
      "kl": 1.2470466375350953,
      "learning_rate": 1.070048309178744e-06,
      "loss": 0.0012,
      "num_tokens": 2392604.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1860
    },
    {
      "completion_length": 11.1,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.0,
      "completions/max_terminated_length": 14.0,
      "completions/mean_length": 11.1,
      "completions/mean_terminated_length": 11.1,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.12859304084720122,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006656737998127937,
      "kl": 1.3809801578521728,
      "learning_rate": 1.0458937198067635e-06,
      "loss": 0.0014,
      "num_tokens": 2404688.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1870
    },
    {
      "completion_length": 23.325,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 64.4,
      "completions/max_terminated_length": 64.4,
      "completions/mean_length": 23.325,
      "completions/mean_terminated_length": 23.325,
      "completions/min_length": 8.1,
      "completions/min_terminated_length": 8.1,
      "epoch": 0.12928070416723972,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.0029542380943894386,
      "kl": 1.4932059407234193,
      "learning_rate": 1.0217391304347828e-06,
      "loss": 0.0015,
      "num_tokens": 2416525.0,
      "reward": 5.925,
      "reward_std": 0.15,
      "rewards/check_coherence/mean": 1.45,
      "rewards/check_coherence/std": 0.1,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1880
    },
    {
      "completion_length": 11.075,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.9,
      "completions/max_terminated_length": 16.9,
      "completions/mean_length": 11.075,
      "completions/mean_terminated_length": 11.075,
      "completions/min_length": 7.9,
      "completions/min_terminated_length": 7.9,
      "epoch": 0.12996836748727822,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004283738322556019,
      "kl": 1.4605644285678863,
      "learning_rate": 9.97584541062802e-07,
      "loss": 0.0015,
      "num_tokens": 2428684.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1890
    },
    {
      "completion_length": 10.925,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.4,
      "completions/max_terminated_length": 14.4,
      "completions/mean_length": 10.925,
      "completions/mean_terminated_length": 10.925,
      "completions/min_length": 8.6,
      "completions/min_terminated_length": 8.6,
      "epoch": 0.13065603080731675,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004885092377662659,
      "kl": 1.2866985321044921,
      "learning_rate": 9.734299516908214e-07,
      "loss": 0.0013,
      "num_tokens": 2440733.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1900
    },
    {
      "completion_length": 11.4,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.7,
      "completions/max_terminated_length": 15.7,
      "completions/mean_length": 11.4,
      "completions/mean_terminated_length": 11.4,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.13134369412735525,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006722769699990749,
      "kl": 1.2210811614990233,
      "learning_rate": 9.492753623188407e-07,
      "loss": 0.0012,
      "num_tokens": 2451929.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1910
    },
    {
      "completion_length": 11.25,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.7,
      "completions/max_terminated_length": 14.7,
      "completions/mean_length": 11.25,
      "completions/mean_terminated_length": 11.25,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.13203135744739375,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0048691569827497005,
      "kl": 1.236713171005249,
      "learning_rate": 9.2512077294686e-07,
      "loss": 0.0012,
      "num_tokens": 2462035.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1920
    },
    {
      "completion_length": 10.6,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.4,
      "completions/max_terminated_length": 13.4,
      "completions/mean_length": 10.6,
      "completions/mean_terminated_length": 10.6,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.13271902076743228,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.021696312353014946,
      "kl": 1.2257360517978668,
      "learning_rate": 9.009661835748792e-07,
      "loss": 0.0012,
      "num_tokens": 2473739.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1930
    },
    {
      "completion_length": 11.2,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.1,
      "completions/max_terminated_length": 14.1,
      "completions/mean_length": 11.2,
      "completions/mean_terminated_length": 11.2,
      "completions/min_length": 8.2,
      "completions/min_terminated_length": 8.2,
      "epoch": 0.13340668408747078,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032455010805279016,
      "kl": 1.325300359725952,
      "learning_rate": 8.768115942028986e-07,
      "loss": 0.0013,
      "num_tokens": 2485935.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1940
    },
    {
      "completion_length": 10.775,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.2,
      "completions/max_terminated_length": 13.2,
      "completions/mean_length": 10.775,
      "completions/mean_terminated_length": 10.775,
      "completions/min_length": 8.1,
      "completions/min_terminated_length": 8.1,
      "epoch": 0.13409434740750928,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002819440560415387,
      "kl": 1.2611912727355956,
      "learning_rate": 8.526570048309179e-07,
      "loss": 0.0013,
      "num_tokens": 2498274.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1950
    },
    {
      "completion_length": 9.65,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.7,
      "completions/max_terminated_length": 12.7,
      "completions/mean_length": 9.65,
      "completions/mean_terminated_length": 9.65,
      "completions/min_length": 7.3,
      "completions/min_terminated_length": 7.3,
      "epoch": 0.13478201072754778,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0049956305883824825,
      "kl": 1.4177262127399444,
      "learning_rate": 8.285024154589373e-07,
      "loss": 0.0014,
      "num_tokens": 2510556.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1960
    },
    {
      "completion_length": 11.2,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.6,
      "completions/max_terminated_length": 15.6,
      "completions/mean_length": 11.2,
      "completions/mean_terminated_length": 11.2,
      "completions/min_length": 8.2,
      "completions/min_terminated_length": 8.2,
      "epoch": 0.1354696740475863,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032518147490918636,
      "kl": 1.2991495847702026,
      "learning_rate": 8.043478260869565e-07,
      "loss": 0.0013,
      "num_tokens": 2525016.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1970
    },
    {
      "completion_length": 11.475,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.9,
      "completions/max_terminated_length": 14.9,
      "completions/mean_length": 11.475,
      "completions/mean_terminated_length": 11.475,
      "completions/min_length": 9.2,
      "completions/min_terminated_length": 9.2,
      "epoch": 0.1361573373676248,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00288589671254158,
      "kl": 1.1318634927272797,
      "learning_rate": 7.801932367149758e-07,
      "loss": 0.0011,
      "num_tokens": 2538403.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1980
    },
    {
      "completion_length": 22.075,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 57.1,
      "completions/max_terminated_length": 57.1,
      "completions/mean_length": 22.075,
      "completions/mean_terminated_length": 22.075,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.1368450006876633,
      "frac_reward_zero_std": 0.7,
      "grad_norm": 1.648767113685608,
      "kl": 1.2390470504760742,
      "learning_rate": 7.560386473429952e-07,
      "loss": 0.0012,
      "num_tokens": 2551406.0,
      "reward": 5.925,
      "reward_std": 0.15,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4625,
      "rewards/check_response_quality/std": 0.075,
      "rewards/match_format_approximately/mean": 0.9625,
      "rewards/match_format_approximately/std": 0.075,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 1990
    },
    {
      "completion_length": 10.525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.3,
      "completions/max_terminated_length": 13.3,
      "completions/mean_length": 10.525,
      "completions/mean_terminated_length": 10.525,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.13753266400770184,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.01037998590618372,
      "kl": 1.352639377117157,
      "learning_rate": 7.318840579710145e-07,
      "loss": 0.0014,
      "num_tokens": 2562439.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2000
    },
    {
      "completion_length": 11.425,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.3,
      "completions/max_terminated_length": 17.3,
      "completions/mean_length": 11.425,
      "completions/mean_terminated_length": 11.425,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.13822032732774034,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003507354063913226,
      "kl": 1.4222690105438232,
      "learning_rate": 7.07729468599034e-07,
      "loss": 0.0014,
      "num_tokens": 2575488.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2010
    },
    {
      "completion_length": 22.575,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 57.5,
      "completions/max_terminated_length": 57.5,
      "completions/mean_length": 22.575,
      "completions/mean_terminated_length": 22.575,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.13890799064777884,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.026074331253767014,
      "kl": 1.262561959028244,
      "learning_rate": 6.835748792270532e-07,
      "loss": 0.0013,
      "num_tokens": 2588159.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2020
    },
    {
      "completion_length": 10.575,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 10.575,
      "completions/mean_terminated_length": 10.575,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.13959565396781737,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00410475442185998,
      "kl": 1.2806087374687194,
      "learning_rate": 6.594202898550725e-07,
      "loss": 0.0013,
      "num_tokens": 2599894.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2030
    },
    {
      "completion_length": 11.825,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.5,
      "completions/max_terminated_length": 16.5,
      "completions/mean_length": 11.825,
      "completions/mean_terminated_length": 11.825,
      "completions/min_length": 8.2,
      "completions/min_terminated_length": 8.2,
      "epoch": 0.14028331728785587,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.015047146007418633,
      "kl": 21.605943036079406,
      "learning_rate": 6.352657004830919e-07,
      "loss": 0.0216,
      "num_tokens": 2613991.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2040
    },
    {
      "completion_length": 10.65,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.2,
      "completions/max_terminated_length": 13.2,
      "completions/mean_length": 10.65,
      "completions/mean_terminated_length": 10.65,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.14097098060789437,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0043119946494698524,
      "kl": 1.3894463539123536,
      "learning_rate": 6.111111111111112e-07,
      "loss": 0.0014,
      "num_tokens": 2627701.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2050
    },
    {
      "completion_length": 11.175,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.2,
      "completions/max_terminated_length": 15.2,
      "completions/mean_length": 11.175,
      "completions/mean_terminated_length": 11.175,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.14165864392793287,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0072251055389642715,
      "kl": 1.2303254783153534,
      "learning_rate": 5.869565217391305e-07,
      "loss": 0.0012,
      "num_tokens": 2639680.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2060
    },
    {
      "completion_length": 12.5,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 19.5,
      "completions/max_terminated_length": 19.5,
      "completions/mean_length": 12.5,
      "completions/mean_terminated_length": 12.5,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.1423463072479714,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.004525719676166773,
      "kl": 1.2622820734977722,
      "learning_rate": 5.628019323671498e-07,
      "loss": 0.0013,
      "num_tokens": 2652124.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2070
    },
    {
      "completion_length": 11.625,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.5,
      "completions/max_terminated_length": 15.5,
      "completions/mean_length": 11.625,
      "completions/mean_terminated_length": 11.625,
      "completions/min_length": 8.1,
      "completions/min_terminated_length": 8.1,
      "epoch": 0.1430339705680099,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004848666954785585,
      "kl": 1.3604639172554016,
      "learning_rate": 5.386473429951692e-07,
      "loss": 0.0014,
      "num_tokens": 2664773.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2080
    },
    {
      "completion_length": 10.625,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.7,
      "completions/max_terminated_length": 14.7,
      "completions/mean_length": 10.625,
      "completions/mean_terminated_length": 10.625,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.1437216338880484,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.003022987162694335,
      "kl": 1.539460152387619,
      "learning_rate": 5.144927536231884e-07,
      "loss": 0.0015,
      "num_tokens": 2677914.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2090
    },
    {
      "completion_length": 11.35,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.4,
      "completions/max_terminated_length": 14.4,
      "completions/mean_length": 11.35,
      "completions/mean_terminated_length": 11.35,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.14440929720808693,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003124531824141741,
      "kl": 1.2718017101287842,
      "learning_rate": 4.903381642512078e-07,
      "loss": 0.0013,
      "num_tokens": 2690576.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2100
    },
    {
      "completion_length": 11.725,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.0,
      "completions/max_terminated_length": 16.0,
      "completions/mean_length": 11.725,
      "completions/mean_terminated_length": 11.725,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.14509696052812543,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002378766192123294,
      "kl": 1.2865601122379302,
      "learning_rate": 4.6618357487922714e-07,
      "loss": 0.0013,
      "num_tokens": 2702989.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2110
    },
    {
      "completion_length": 10.75,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.7,
      "completions/max_terminated_length": 13.7,
      "completions/mean_length": 10.75,
      "completions/mean_terminated_length": 10.75,
      "completions/min_length": 8.6,
      "completions/min_terminated_length": 8.6,
      "epoch": 0.14578462384816393,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004591071512550116,
      "kl": 1.3285470128059387,
      "learning_rate": 4.420289855072464e-07,
      "loss": 0.0013,
      "num_tokens": 2715139.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2120
    },
    {
      "completion_length": 10.675,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 10.675,
      "completions/mean_terminated_length": 10.675,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.14647228716820246,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032588632311671972,
      "kl": 1.2768325805664062,
      "learning_rate": 4.1787439613526574e-07,
      "loss": 0.0013,
      "num_tokens": 2728638.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2130
    },
    {
      "completion_length": 12.1,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 18.1,
      "completions/max_terminated_length": 18.1,
      "completions/mean_length": 12.1,
      "completions/mean_terminated_length": 12.1,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.14715995048824096,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.008527855388820171,
      "kl": 1.3150019347667694,
      "learning_rate": 3.9371980676328504e-07,
      "loss": 0.0013,
      "num_tokens": 2742582.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2140
    },
    {
      "completion_length": 11.3,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.7,
      "completions/max_terminated_length": 15.7,
      "completions/mean_length": 11.3,
      "completions/mean_terminated_length": 11.3,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.14784761380827946,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.005529398564249277,
      "kl": 1.3183214008808135,
      "learning_rate": 3.695652173913044e-07,
      "loss": 0.0013,
      "num_tokens": 2753762.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2150
    },
    {
      "completion_length": 11.425,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.4,
      "completions/max_terminated_length": 14.4,
      "completions/mean_length": 11.425,
      "completions/mean_terminated_length": 11.425,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.148535277128318,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021252967417240143,
      "kl": 1.1668545484542847,
      "learning_rate": 3.454106280193237e-07,
      "loss": 0.0012,
      "num_tokens": 2767079.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2160
    },
    {
      "completion_length": 11.4,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.9,
      "completions/max_terminated_length": 15.9,
      "completions/mean_length": 11.4,
      "completions/mean_terminated_length": 11.4,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.1492229404483565,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.015458406880497932,
      "kl": 1.3273634731769561,
      "learning_rate": 3.2125603864734306e-07,
      "loss": 0.0013,
      "num_tokens": 2780155.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2170
    },
    {
      "completion_length": 13.45,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 20.7,
      "completions/max_terminated_length": 20.7,
      "completions/mean_length": 13.45,
      "completions/mean_terminated_length": 13.45,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.149910603768395,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0029216071125119925,
      "kl": 2.837045794725418,
      "learning_rate": 2.9710144927536236e-07,
      "loss": 0.0028,
      "num_tokens": 2793497.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2180
    },
    {
      "completion_length": 11.3,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.1,
      "completions/max_terminated_length": 16.1,
      "completions/mean_length": 11.3,
      "completions/mean_terminated_length": 11.3,
      "completions/min_length": 7.9,
      "completions/min_terminated_length": 7.9,
      "epoch": 0.1505982670884335,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004856933373957872,
      "kl": 1.2759633779525756,
      "learning_rate": 2.7294685990338166e-07,
      "loss": 0.0013,
      "num_tokens": 2805209.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2190
    },
    {
      "completion_length": 11.625,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.8,
      "completions/max_terminated_length": 15.8,
      "completions/mean_length": 11.625,
      "completions/mean_terminated_length": 11.625,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.15128593040847202,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005369469057768583,
      "kl": 1.2020570576190948,
      "learning_rate": 2.4879227053140096e-07,
      "loss": 0.0012,
      "num_tokens": 2819046.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2200
    },
    {
      "completion_length": 10.075,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.4,
      "completions/max_terminated_length": 13.4,
      "completions/mean_length": 10.075,
      "completions/mean_terminated_length": 10.075,
      "completions/min_length": 7.5,
      "completions/min_terminated_length": 7.5,
      "epoch": 0.15197359372851052,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032669377978891134,
      "kl": 1.4382840514183044,
      "learning_rate": 2.2463768115942032e-07,
      "loss": 0.0014,
      "num_tokens": 2830509.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2210
    },
    {
      "completion_length": 10.575,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.1,
      "completions/max_terminated_length": 13.1,
      "completions/mean_length": 10.575,
      "completions/mean_terminated_length": 10.575,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.15266125704854902,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.011143822222948074,
      "kl": 1.4830769181251526,
      "learning_rate": 2.0048309178743962e-07,
      "loss": 0.0015,
      "num_tokens": 2842048.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2220
    },
    {
      "completion_length": 11.65,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.2,
      "completions/max_terminated_length": 16.2,
      "completions/mean_length": 11.65,
      "completions/mean_terminated_length": 11.65,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.15334892036858755,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004490590654313564,
      "kl": 1.174393892288208,
      "learning_rate": 1.7632850241545895e-07,
      "loss": 0.0012,
      "num_tokens": 2854086.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2230
    },
    {
      "completion_length": 11.625,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.9,
      "completions/max_terminated_length": 16.9,
      "completions/mean_length": 11.625,
      "completions/mean_terminated_length": 11.625,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.15403658368862605,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004963410086929798,
      "kl": 1.3155929028987885,
      "learning_rate": 1.5217391304347828e-07,
      "loss": 0.0013,
      "num_tokens": 2867383.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2240
    },
    {
      "completion_length": 11.15,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.6,
      "completions/max_terminated_length": 14.6,
      "completions/mean_length": 11.15,
      "completions/mean_terminated_length": 11.15,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.15472424700866455,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004094480536878109,
      "kl": 1.2090741574764252,
      "learning_rate": 1.280193236714976e-07,
      "loss": 0.0012,
      "num_tokens": 2878613.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2250
    },
    {
      "completion_length": 41.475,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 134.7,
      "completions/max_terminated_length": 134.7,
      "completions/mean_length": 41.475,
      "completions/mean_terminated_length": 41.475,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.15541191032870308,
      "frac_reward_zero_std": 0.6,
      "grad_norm": 0.008804809302091599,
      "kl": 1.232531774044037,
      "learning_rate": 1.0386473429951691e-07,
      "loss": 0.0012,
      "num_tokens": 2891968.0,
      "reward": 5.8875,
      "reward_std": 0.225,
      "rewards/check_coherence/mean": 1.4625,
      "rewards/check_coherence/std": 0.075,
      "rewards/check_response_quality/mean": 2.4625,
      "rewards/check_response_quality/std": 0.075,
      "rewards/match_format_approximately/mean": 0.9625,
      "rewards/match_format_approximately/std": 0.075,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2260
    },
    {
      "completion_length": 11.4,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.4,
      "completions/max_terminated_length": 15.4,
      "completions/mean_length": 11.4,
      "completions/mean_terminated_length": 11.4,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.15609957364874158,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023921902757138014,
      "kl": 1.363280749320984,
      "learning_rate": 7.971014492753624e-08,
      "loss": 0.0014,
      "num_tokens": 2905756.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2270
    },
    {
      "completion_length": 15.475,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 30.2,
      "completions/max_terminated_length": 30.2,
      "completions/mean_length": 15.475,
      "completions/mean_terminated_length": 15.475,
      "completions/min_length": 8.1,
      "completions/min_terminated_length": 8.1,
      "epoch": 0.15678723696878008,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0035021391231566668,
      "kl": 1.1597087323665618,
      "learning_rate": 5.555555555555556e-08,
      "loss": 0.0012,
      "num_tokens": 2917635.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2280
    },
    {
      "completion_length": 13.9,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 24.6,
      "completions/max_terminated_length": 24.6,
      "completions/mean_length": 13.9,
      "completions/mean_terminated_length": 13.9,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.15747490028881858,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0036871584597975016,
      "kl": 1.1678650498390197,
      "learning_rate": 3.140096618357488e-08,
      "loss": 0.0012,
      "num_tokens": 2931199.0,
      "reward": 5.9625,
      "reward_std": 0.075,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.05,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2290
    },
    {
      "completion_length": 50.375,
      "completions/clipped_ratio": 0.025,
      "completions/max_length": 170.5,
      "completions/max_terminated_length": 13.9,
      "completions/mean_length": 50.375,
      "completions/mean_terminated_length": 11.13333339691162,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.1581625636088571,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.004820572212338448,
      "kl": 1.4776630043983459,
      "learning_rate": 7.246376811594204e-09,
      "loss": 0.0015,
      "num_tokens": 2943782.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2300
    },
    {
      "completion_length": 19.6,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 48.0,
      "completions/max_terminated_length": 48.0,
      "completions/mean_length": 19.6,
      "completions/mean_terminated_length": 19.6,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.1588502269288956,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 1.7149800062179565,
      "kl": 1.1576267778873444,
      "learning_rate": 4.115480649188515e-06,
      "loss": 0.0012,
      "num_tokens": 2956230.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2310
    },
    {
      "completion_length": 11.2,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.7,
      "completions/max_terminated_length": 13.7,
      "completions/mean_length": 11.2,
      "completions/mean_terminated_length": 11.2,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.1595378902489341,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003225781721994281,
      "kl": 1.2905008971691132,
      "learning_rate": 4.109238451935081e-06,
      "loss": 0.0013,
      "num_tokens": 2966358.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2320
    },
    {
      "completion_length": 11.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.8,
      "completions/max_terminated_length": 15.8,
      "completions/mean_length": 11.0,
      "completions/mean_terminated_length": 11.0,
      "completions/min_length": 8.1,
      "completions/min_terminated_length": 8.1,
      "epoch": 0.16022555356897264,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004192621912807226,
      "kl": 1.300399947166443,
      "learning_rate": 4.102996254681649e-06,
      "loss": 0.0013,
      "num_tokens": 2978250.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2330
    },
    {
      "completion_length": 11.05,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.3,
      "completions/max_terminated_length": 14.3,
      "completions/mean_length": 11.05,
      "completions/mean_terminated_length": 11.05,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.16091321688901114,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0060669067315757275,
      "kl": 1.0718920350074768,
      "learning_rate": 4.096754057428215e-06,
      "loss": 0.0011,
      "num_tokens": 2990024.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2340
    },
    {
      "completion_length": 10.55,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.9,
      "completions/max_terminated_length": 12.9,
      "completions/mean_length": 10.55,
      "completions/mean_terminated_length": 10.55,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.16160088020904964,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020677910652011633,
      "kl": 1.2670764803886414,
      "learning_rate": 4.090511860174782e-06,
      "loss": 0.0013,
      "num_tokens": 3002978.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2350
    },
    {
      "completion_length": 10.05,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 11.6,
      "completions/max_terminated_length": 11.6,
      "completions/mean_length": 10.05,
      "completions/mean_terminated_length": 10.05,
      "completions/min_length": 8.2,
      "completions/min_terminated_length": 8.2,
      "epoch": 0.16228854352908817,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005457987543195486,
      "kl": 1.3371248841285706,
      "learning_rate": 4.0842696629213485e-06,
      "loss": 0.0013,
      "num_tokens": 3016648.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2360
    },
    {
      "completion_length": 10.8,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.7,
      "completions/max_terminated_length": 13.7,
      "completions/mean_length": 10.8,
      "completions/mean_terminated_length": 10.8,
      "completions/min_length": 8.1,
      "completions/min_terminated_length": 8.1,
      "epoch": 0.16297620684912667,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008317215833812952,
      "kl": 1.2839708745479583,
      "learning_rate": 4.078027465667916e-06,
      "loss": 0.0013,
      "num_tokens": 3026820.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2370
    },
    {
      "completion_length": 11.7,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.4,
      "completions/max_terminated_length": 15.4,
      "completions/mean_length": 11.7,
      "completions/mean_terminated_length": 11.7,
      "completions/min_length": 9.2,
      "completions/min_terminated_length": 9.2,
      "epoch": 0.16366387016916517,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004411312751471996,
      "kl": 1.2436644613742829,
      "learning_rate": 4.071785268414482e-06,
      "loss": 0.0012,
      "num_tokens": 3037648.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2380
    },
    {
      "completion_length": 10.55,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.3,
      "completions/max_terminated_length": 13.3,
      "completions/mean_length": 10.55,
      "completions/mean_terminated_length": 10.55,
      "completions/min_length": 8.1,
      "completions/min_terminated_length": 8.1,
      "epoch": 0.1643515334892037,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0036025703884661198,
      "kl": 1.3522222876548766,
      "learning_rate": 4.0655430711610484e-06,
      "loss": 0.0014,
      "num_tokens": 3049514.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2390
    },
    {
      "completion_length": 10.75,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.5,
      "completions/max_terminated_length": 14.5,
      "completions/mean_length": 10.75,
      "completions/mean_terminated_length": 10.75,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.1650391968092422,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005733081139624119,
      "kl": 1.279357409477234,
      "learning_rate": 4.059300873907616e-06,
      "loss": 0.0013,
      "num_tokens": 3061816.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2400
    },
    {
      "completion_length": 11.95,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.4,
      "completions/max_terminated_length": 15.4,
      "completions/mean_length": 11.95,
      "completions/mean_terminated_length": 11.95,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.1657268601292807,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027056564576923847,
      "kl": 3.6889904975891112,
      "learning_rate": 4.053058676654182e-06,
      "loss": 0.0037,
      "num_tokens": 3075390.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2410
    },
    {
      "completion_length": 64.775,
      "completions/clipped_ratio": 0.025,
      "completions/max_length": 226.5,
      "completions/max_terminated_length": 70.1,
      "completions/mean_length": 64.775,
      "completions/mean_terminated_length": 25.483333396911622,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.1664145234493192,
      "frac_reward_zero_std": 0.7,
      "grad_norm": 3.775110960006714,
      "kl": 1.2242095589637756,
      "learning_rate": 4.046816479400749e-06,
      "loss": 0.0012,
      "num_tokens": 3090241.0,
      "reward": 5.8875,
      "reward_std": 0.225,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.45,
      "rewards/check_response_quality/std": 0.1,
      "rewards/match_format_approximately/mean": 0.9625,
      "rewards/match_format_approximately/std": 0.075,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2420
    },
    {
      "completion_length": 30.7,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 94.8,
      "completions/max_terminated_length": 94.8,
      "completions/mean_length": 30.7,
      "completions/mean_terminated_length": 30.7,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.16710218676935773,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.008342845365405083,
      "kl": 1.3111794650554658,
      "learning_rate": 4.0405742821473155e-06,
      "loss": 0.0013,
      "num_tokens": 3103937.0,
      "reward": 5.9375,
      "reward_std": 0.125,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.05,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2430
    },
    {
      "completion_length": 11.45,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.6,
      "completions/max_terminated_length": 14.6,
      "completions/mean_length": 11.45,
      "completions/mean_terminated_length": 11.45,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.16778985008939623,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006231814622879028,
      "kl": 1.4186553835868836,
      "learning_rate": 4.034332084893883e-06,
      "loss": 0.0014,
      "num_tokens": 3116879.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2440
    },
    {
      "completion_length": 11.375,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.3,
      "completions/max_terminated_length": 15.3,
      "completions/mean_length": 11.375,
      "completions/mean_terminated_length": 11.375,
      "completions/min_length": 8.6,
      "completions/min_terminated_length": 8.6,
      "epoch": 0.16847751340943473,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00336334272287786,
      "kl": 1.346349060535431,
      "learning_rate": 4.028089887640449e-06,
      "loss": 0.0013,
      "num_tokens": 3129722.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2450
    },
    {
      "completion_length": 11.475,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.5,
      "completions/max_terminated_length": 15.5,
      "completions/mean_length": 11.475,
      "completions/mean_terminated_length": 11.475,
      "completions/min_length": 8.2,
      "completions/min_terminated_length": 8.2,
      "epoch": 0.16916517672947326,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003932368475943804,
      "kl": 1.185975819826126,
      "learning_rate": 4.021847690387017e-06,
      "loss": 0.0012,
      "num_tokens": 3141045.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2460
    },
    {
      "completion_length": 11.475,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.3,
      "completions/max_terminated_length": 15.3,
      "completions/mean_length": 11.475,
      "completions/mean_terminated_length": 11.475,
      "completions/min_length": 9.2,
      "completions/min_terminated_length": 9.2,
      "epoch": 0.16985284004951176,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002778848400339484,
      "kl": 1.2687766671180725,
      "learning_rate": 4.0156054931335835e-06,
      "loss": 0.0013,
      "num_tokens": 3155076.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2470
    },
    {
      "completion_length": 11.825,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.5,
      "completions/max_terminated_length": 15.5,
      "completions/mean_length": 11.825,
      "completions/mean_terminated_length": 11.825,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.17054050336955026,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002489407081156969,
      "kl": 1.169280767440796,
      "learning_rate": 4.009363295880151e-06,
      "loss": 0.0012,
      "num_tokens": 3167025.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2480
    },
    {
      "completion_length": 18.075,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 44.3,
      "completions/max_terminated_length": 44.3,
      "completions/mean_length": 18.075,
      "completions/mean_terminated_length": 18.075,
      "completions/min_length": 8.1,
      "completions/min_terminated_length": 8.1,
      "epoch": 0.1712281666895888,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.010691997595131397,
      "kl": 1.4809851229190827,
      "learning_rate": 4.003121098626717e-06,
      "loss": 0.0015,
      "num_tokens": 3177980.0,
      "reward": 5.9625,
      "reward_std": 0.075,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.05,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2490
    },
    {
      "completion_length": 14.9,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 31.8,
      "completions/max_terminated_length": 31.8,
      "completions/mean_length": 14.9,
      "completions/mean_terminated_length": 14.9,
      "completions/min_length": 7.6,
      "completions/min_terminated_length": 7.6,
      "epoch": 0.1719158300096273,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.002949868328869343,
      "kl": 1.39860680103302,
      "learning_rate": 3.2043314500941624e-06,
      "loss": 0.0014,
      "num_tokens": 3189244.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.05,
      "rewards/match_format_approximately/mean": 0.975,
      "rewards/match_format_approximately/std": 0.05,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2500
    },
    {
      "completion_length": 10.625,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.7,
      "completions/max_terminated_length": 13.7,
      "completions/mean_length": 10.625,
      "completions/mean_terminated_length": 10.625,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.1726034933296658,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.010366762056946754,
      "kl": 1.3766119718551635,
      "learning_rate": 3.1949152542372884e-06,
      "loss": 0.0014,
      "num_tokens": 3202885.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2510
    },
    {
      "completion_length": 10.275,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.5,
      "completions/max_terminated_length": 12.5,
      "completions/mean_length": 10.275,
      "completions/mean_terminated_length": 10.275,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.1732911566497043,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0029948921874165535,
      "kl": 1.4934549808502198,
      "learning_rate": 3.1854990583804148e-06,
      "loss": 0.0015,
      "num_tokens": 3214960.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2520
    },
    {
      "completion_length": 11.15,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.9,
      "completions/max_terminated_length": 15.9,
      "completions/mean_length": 11.15,
      "completions/mean_terminated_length": 11.15,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.17397881996974282,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0014775346498936415,
      "kl": 1.3120596766471864,
      "learning_rate": 3.1760828625235407e-06,
      "loss": 0.0013,
      "num_tokens": 3225346.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2530
    },
    {
      "completion_length": 10.675,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.2,
      "completions/max_terminated_length": 13.2,
      "completions/mean_length": 10.675,
      "completions/mean_terminated_length": 10.675,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.17466648328978132,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0037268514279276133,
      "kl": 1.3505820691585542,
      "learning_rate": 3.1666666666666667e-06,
      "loss": 0.0014,
      "num_tokens": 3237365.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2540
    },
    {
      "completion_length": 10.6,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.7,
      "completions/max_terminated_length": 12.7,
      "completions/mean_length": 10.6,
      "completions/mean_terminated_length": 10.6,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.17535414660981982,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002531415317207575,
      "kl": 1.456881034374237,
      "learning_rate": 3.1572504708097927e-06,
      "loss": 0.0015,
      "num_tokens": 3251221.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2550
    },
    {
      "completion_length": 11.725,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.5,
      "completions/max_terminated_length": 14.5,
      "completions/mean_length": 11.725,
      "completions/mean_terminated_length": 11.725,
      "completions/min_length": 9.5,
      "completions/min_terminated_length": 9.5,
      "epoch": 0.17604180992985835,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009061750024557114,
      "kl": 1.244165402650833,
      "learning_rate": 3.1478342749529195e-06,
      "loss": 0.0012,
      "num_tokens": 3263818.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2560
    },
    {
      "completion_length": 11.25,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.4,
      "completions/max_terminated_length": 15.4,
      "completions/mean_length": 11.25,
      "completions/mean_terminated_length": 11.25,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.17672947324989685,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00531205628067255,
      "kl": 1.3328250467777252,
      "learning_rate": 3.1384180790960454e-06,
      "loss": 0.0013,
      "num_tokens": 3276140.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2570
    },
    {
      "completion_length": 10.15,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 11.5,
      "completions/max_terminated_length": 11.5,
      "completions/mean_length": 10.15,
      "completions/mean_terminated_length": 10.15,
      "completions/min_length": 8.2,
      "completions/min_terminated_length": 8.2,
      "epoch": 0.17741713656993535,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 11.290862083435059,
      "kl": 1.4745511174201966,
      "learning_rate": 3.129001883239172e-06,
      "loss": 0.0015,
      "num_tokens": 3288702.0,
      "reward": 5.9625,
      "reward_std": 0.075,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.05,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2580
    },
    {
      "completion_length": 10.475,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.8,
      "completions/max_terminated_length": 12.8,
      "completions/mean_length": 10.475,
      "completions/mean_terminated_length": 10.475,
      "completions/min_length": 8.2,
      "completions/min_terminated_length": 8.2,
      "epoch": 0.17810479988997388,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004790943581610918,
      "kl": 1.3869511127471923,
      "learning_rate": 3.1195856873822978e-06,
      "loss": 0.0014,
      "num_tokens": 3301205.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2590
    },
    {
      "completion_length": 10.85,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.3,
      "completions/max_terminated_length": 13.3,
      "completions/mean_length": 10.85,
      "completions/mean_terminated_length": 10.85,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.17879246321001238,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015610884875059128,
      "kl": 1.197289276123047,
      "learning_rate": 3.1101694915254237e-06,
      "loss": 0.0012,
      "num_tokens": 3313747.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2600
    },
    {
      "completion_length": 10.675,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 10.675,
      "completions/mean_terminated_length": 10.675,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.17948012653005088,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018645133823156357,
      "kl": 1.4583391726016999,
      "learning_rate": 3.1007532956685505e-06,
      "loss": 0.0015,
      "num_tokens": 3325470.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2610
    },
    {
      "completion_length": 16.625,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 37.0,
      "completions/max_terminated_length": 37.0,
      "completions/mean_length": 16.625,
      "completions/mean_terminated_length": 16.625,
      "completions/min_length": 8.6,
      "completions/min_terminated_length": 8.6,
      "epoch": 0.18016778985008938,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.004674810450524092,
      "kl": 1.2932902693748474,
      "learning_rate": 3.0913370998116765e-06,
      "loss": 0.0013,
      "num_tokens": 3337531.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2620
    },
    {
      "completion_length": 11.575,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.6,
      "completions/max_terminated_length": 14.6,
      "completions/mean_length": 11.575,
      "completions/mean_terminated_length": 11.575,
      "completions/min_length": 9.6,
      "completions/min_terminated_length": 9.6,
      "epoch": 0.1808554531701279,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0030683784279972315,
      "kl": 1.154135423898697,
      "learning_rate": 3.0819209039548024e-06,
      "loss": 0.0012,
      "num_tokens": 3350250.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2630
    },
    {
      "completion_length": 11.3,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.3,
      "completions/max_terminated_length": 15.3,
      "completions/mean_length": 11.3,
      "completions/mean_terminated_length": 11.3,
      "completions/min_length": 8.6,
      "completions/min_terminated_length": 8.6,
      "epoch": 0.1815431164901664,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005663893185555935,
      "kl": 1.3627763926982879,
      "learning_rate": 3.072504708097929e-06,
      "loss": 0.0014,
      "num_tokens": 3362334.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2640
    },
    {
      "completion_length": 11.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.3,
      "completions/max_terminated_length": 13.3,
      "completions/mean_length": 11.0,
      "completions/mean_terminated_length": 11.0,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.1822307798102049,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.004335007164627314,
      "kl": 1.2993446350097657,
      "learning_rate": 3.0630885122410548e-06,
      "loss": 0.0013,
      "num_tokens": 3376026.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2650
    },
    {
      "completion_length": 11.275,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.3,
      "completions/max_terminated_length": 14.3,
      "completions/mean_length": 11.275,
      "completions/mean_terminated_length": 11.275,
      "completions/min_length": 9.2,
      "completions/min_terminated_length": 9.2,
      "epoch": 0.18291844313024344,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.010090747848153114,
      "kl": 1.2555712342262269,
      "learning_rate": 3.0536723163841807e-06,
      "loss": 0.0013,
      "num_tokens": 3388197.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2660
    },
    {
      "completion_length": 10.475,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.5,
      "completions/max_terminated_length": 12.5,
      "completions/mean_length": 10.475,
      "completions/mean_terminated_length": 10.475,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.18360610645028194,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005461522843688726,
      "kl": 1.423725974559784,
      "learning_rate": 3.0442561205273075e-06,
      "loss": 0.0014,
      "num_tokens": 3399652.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2670
    },
    {
      "completion_length": 10.425,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.2,
      "completions/max_terminated_length": 13.2,
      "completions/mean_length": 10.425,
      "completions/mean_terminated_length": 10.425,
      "completions/min_length": 8.2,
      "completions/min_terminated_length": 8.2,
      "epoch": 0.18429376977032044,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002112050075083971,
      "kl": 1.3453264117240906,
      "learning_rate": 3.0348399246704335e-06,
      "loss": 0.0013,
      "num_tokens": 3411169.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2680
    },
    {
      "completion_length": 11.2,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.6,
      "completions/max_terminated_length": 14.6,
      "completions/mean_length": 11.2,
      "completions/mean_terminated_length": 11.2,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.18498143309035897,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004399535246193409,
      "kl": 1.5060070991516112,
      "learning_rate": 3.0254237288135594e-06,
      "loss": 0.0015,
      "num_tokens": 3423749.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2690
    },
    {
      "completion_length": 10.5,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.7,
      "completions/max_terminated_length": 12.7,
      "completions/mean_length": 10.5,
      "completions/mean_terminated_length": 10.5,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.18566909641039747,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005135955289006233,
      "kl": 1.2228712022304535,
      "learning_rate": 3.016007532956686e-06,
      "loss": 0.0012,
      "num_tokens": 3434905.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2700
    },
    {
      "completion_length": 10.625,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.9,
      "completions/max_terminated_length": 12.9,
      "completions/mean_length": 10.625,
      "completions/mean_terminated_length": 10.625,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.18635675973043597,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008232400752604008,
      "kl": 1.2177106857299804,
      "learning_rate": 3.0065913370998118e-06,
      "loss": 0.0012,
      "num_tokens": 3446182.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2710
    },
    {
      "completion_length": 10.875,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.6,
      "completions/max_terminated_length": 13.6,
      "completions/mean_length": 10.875,
      "completions/mean_terminated_length": 10.875,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.1870444230504745,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0071622999384999275,
      "kl": 1.387217903137207,
      "learning_rate": 2.9971751412429377e-06,
      "loss": 0.0014,
      "num_tokens": 3457733.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2720
    },
    {
      "completion_length": 10.675,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.1,
      "completions/max_terminated_length": 13.1,
      "completions/mean_length": 10.675,
      "completions/mean_terminated_length": 10.675,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.187732086370513,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003097102278843522,
      "kl": 1.302090060710907,
      "learning_rate": 2.9877589453860645e-06,
      "loss": 0.0013,
      "num_tokens": 3470068.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2730
    },
    {
      "completion_length": 10.825,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.1,
      "completions/max_terminated_length": 14.1,
      "completions/mean_length": 10.825,
      "completions/mean_terminated_length": 10.825,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.1884197496905515,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003183470806106925,
      "kl": 1.4097931861877442,
      "learning_rate": 2.9783427495291905e-06,
      "loss": 0.0014,
      "num_tokens": 3480493.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2740
    },
    {
      "completion_length": 11.175,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.6,
      "completions/max_terminated_length": 14.6,
      "completions/mean_length": 11.175,
      "completions/mean_terminated_length": 11.175,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.18910741301059,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004561976063996553,
      "kl": 1.1230164349079133,
      "learning_rate": 2.9689265536723165e-06,
      "loss": 0.0011,
      "num_tokens": 3490128.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2750
    },
    {
      "completion_length": 11.275,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.7,
      "completions/max_terminated_length": 15.7,
      "completions/mean_length": 11.275,
      "completions/mean_terminated_length": 11.275,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.18979507633062853,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.005841500591486692,
      "kl": 1.239791786670685,
      "learning_rate": 2.959510357815443e-06,
      "loss": 0.0012,
      "num_tokens": 3501943.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2760
    },
    {
      "completion_length": 11.125,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.9,
      "completions/max_terminated_length": 13.9,
      "completions/mean_length": 11.125,
      "completions/mean_terminated_length": 11.125,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.19048273965066703,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.018330121412873268,
      "kl": 2.2330337703227996,
      "learning_rate": 2.950094161958569e-06,
      "loss": 0.0022,
      "num_tokens": 3514268.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2770
    },
    {
      "completion_length": 10.575,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.7,
      "completions/max_terminated_length": 12.7,
      "completions/mean_length": 10.575,
      "completions/mean_terminated_length": 10.575,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.19117040297070553,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025839856825768948,
      "kl": 1.3854608416557312,
      "learning_rate": 2.9406779661016956e-06,
      "loss": 0.0014,
      "num_tokens": 3525975.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2780
    },
    {
      "completion_length": 11.625,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.9,
      "completions/max_terminated_length": 15.9,
      "completions/mean_length": 11.625,
      "completions/mean_terminated_length": 11.625,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.19185806629074406,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0035424588713794947,
      "kl": 1.3310083508491517,
      "learning_rate": 2.9312617702448216e-06,
      "loss": 0.0013,
      "num_tokens": 3538256.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2790
    },
    {
      "completion_length": 10.425,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.2,
      "completions/max_terminated_length": 13.2,
      "completions/mean_length": 10.425,
      "completions/mean_terminated_length": 10.425,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.19254572961078256,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0014532285276800394,
      "kl": 1.4109968423843384,
      "learning_rate": 2.9218455743879475e-06,
      "loss": 0.0014,
      "num_tokens": 3549189.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2800
    },
    {
      "completion_length": 12.45,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 20.5,
      "completions/max_terminated_length": 20.5,
      "completions/mean_length": 12.45,
      "completions/mean_terminated_length": 12.45,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.19323339293082106,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.004302640911191702,
      "kl": 1.2859423279762268,
      "learning_rate": 2.9124293785310735e-06,
      "loss": 0.0013,
      "num_tokens": 3561783.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2810
    },
    {
      "completion_length": 10.525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.7,
      "completions/max_terminated_length": 12.7,
      "completions/mean_length": 10.525,
      "completions/mean_terminated_length": 10.525,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.1939210562508596,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007680397480726242,
      "kl": 1.4603121995925903,
      "learning_rate": 2.9030131826742e-06,
      "loss": 0.0015,
      "num_tokens": 3574500.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2820
    },
    {
      "completion_length": 10.775,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.9,
      "completions/max_terminated_length": 12.9,
      "completions/mean_length": 10.775,
      "completions/mean_terminated_length": 10.775,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.1946087195708981,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020181452855467796,
      "kl": 1.3944905817508697,
      "learning_rate": 2.893596986817326e-06,
      "loss": 0.0014,
      "num_tokens": 3587815.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2830
    },
    {
      "completion_length": 11.1,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.1,
      "completions/max_terminated_length": 17.1,
      "completions/mean_length": 11.1,
      "completions/mean_terminated_length": 11.1,
      "completions/min_length": 7.6,
      "completions/min_terminated_length": 7.6,
      "epoch": 0.1952963828909366,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006148161832243204,
      "kl": 1.2499525606632234,
      "learning_rate": 2.8841807909604526e-06,
      "loss": 0.0012,
      "num_tokens": 3599027.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2840
    },
    {
      "completion_length": 11.1,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.2,
      "completions/max_terminated_length": 15.2,
      "completions/mean_length": 11.1,
      "completions/mean_terminated_length": 11.1,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.1959840462109751,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0890081599354744,
      "kl": 1.3890787661075592,
      "learning_rate": 2.8747645951035786e-06,
      "loss": 0.0014,
      "num_tokens": 3611155.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2850
    },
    {
      "completion_length": 11.65,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.2,
      "completions/max_terminated_length": 15.2,
      "completions/mean_length": 11.65,
      "completions/mean_terminated_length": 11.65,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.19667170953101362,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004152151755988598,
      "kl": 1.1795121252536773,
      "learning_rate": 2.8653483992467045e-06,
      "loss": 0.0012,
      "num_tokens": 3622457.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2860
    },
    {
      "completion_length": 11.475,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.7,
      "completions/max_terminated_length": 13.7,
      "completions/mean_length": 11.475,
      "completions/mean_terminated_length": 11.475,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.19735937285105212,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0033720259089022875,
      "kl": 1.2966312885284423,
      "learning_rate": 2.8559322033898305e-06,
      "loss": 0.0013,
      "num_tokens": 3635224.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2870
    },
    {
      "completion_length": 10.65,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 10.65,
      "completions/mean_terminated_length": 10.65,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.19804703617109062,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.02166939526796341,
      "kl": 1.1902290284633636,
      "learning_rate": 2.846516007532957e-06,
      "loss": 0.0012,
      "num_tokens": 3648742.0,
      "reward": 5.9875,
      "reward_std": 0.025,
      "rewards/check_coherence/mean": 1.4875,
      "rewards/check_coherence/std": 0.025,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2880
    },
    {
      "completion_length": 17.05,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 35.0,
      "completions/max_terminated_length": 35.0,
      "completions/mean_length": 17.05,
      "completions/mean_terminated_length": 17.05,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.19873469949112915,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.003716163570061326,
      "kl": 1.2905597269535065,
      "learning_rate": 2.837099811676083e-06,
      "loss": 0.0013,
      "num_tokens": 3662064.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.05,
      "rewards/match_format_approximately/mean": 0.975,
      "rewards/match_format_approximately/std": 0.05,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2890
    },
    {
      "completion_length": 20.15,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 49.5,
      "completions/max_terminated_length": 49.5,
      "completions/mean_length": 20.15,
      "completions/mean_terminated_length": 20.15,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.19942236281116765,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0016892965650185943,
      "kl": 1.356507009267807,
      "learning_rate": 2.8276836158192096e-06,
      "loss": 0.0014,
      "num_tokens": 3676226.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2900
    },
    {
      "completion_length": 10.425,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.2,
      "completions/max_terminated_length": 12.2,
      "completions/mean_length": 10.425,
      "completions/mean_terminated_length": 10.425,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.20011002613120615,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007044503930956125,
      "kl": 1.2502503633499145,
      "learning_rate": 2.8182674199623356e-06,
      "loss": 0.0013,
      "num_tokens": 3688579.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2910
    },
    {
      "completion_length": 11.475,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.7,
      "completions/max_terminated_length": 13.7,
      "completions/mean_length": 11.475,
      "completions/mean_terminated_length": 11.475,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.20079768945124468,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0026405714452266693,
      "kl": 1.3286872804164886,
      "learning_rate": 2.8088512241054615e-06,
      "loss": 0.0013,
      "num_tokens": 3702350.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2920
    },
    {
      "completion_length": 11.625,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.2,
      "completions/max_terminated_length": 17.2,
      "completions/mean_length": 11.625,
      "completions/mean_terminated_length": 11.625,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.20148535277128318,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002148184459656477,
      "kl": 1.135231328010559,
      "learning_rate": 2.799435028248588e-06,
      "loss": 0.0011,
      "num_tokens": 3713775.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2930
    },
    {
      "completion_length": 10.8,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.4,
      "completions/max_terminated_length": 12.4,
      "completions/mean_length": 10.8,
      "completions/mean_terminated_length": 10.8,
      "completions/min_length": 9.8,
      "completions/min_terminated_length": 9.8,
      "epoch": 0.20217301609132168,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004673804622143507,
      "kl": 1.3920660734176635,
      "learning_rate": 2.790018832391714e-06,
      "loss": 0.0014,
      "num_tokens": 3725491.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2940
    },
    {
      "completion_length": 12.475,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.9,
      "completions/max_terminated_length": 17.9,
      "completions/mean_length": 12.475,
      "completions/mean_terminated_length": 12.475,
      "completions/min_length": 9.2,
      "completions/min_terminated_length": 9.2,
      "epoch": 0.2028606794113602,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004096169024705887,
      "kl": 1.3312494993209838,
      "learning_rate": 2.7806026365348403e-06,
      "loss": 0.0013,
      "num_tokens": 3738282.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2950
    },
    {
      "completion_length": 12.775,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 19.4,
      "completions/max_terminated_length": 19.4,
      "completions/mean_length": 12.775,
      "completions/mean_terminated_length": 12.775,
      "completions/min_length": 9.7,
      "completions/min_terminated_length": 9.7,
      "epoch": 0.2035483427313987,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.00869634561240673,
      "kl": 1.067715847492218,
      "learning_rate": 2.7711864406779666e-06,
      "loss": 0.0011,
      "num_tokens": 3752477.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2960
    },
    {
      "completion_length": 13.55,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.9,
      "completions/max_terminated_length": 17.9,
      "completions/mean_length": 13.55,
      "completions/mean_terminated_length": 13.55,
      "completions/min_length": 9.9,
      "completions/min_terminated_length": 9.9,
      "epoch": 0.2042360060514372,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003822652855888009,
      "kl": 2.798942339420319,
      "learning_rate": 2.7617702448210926e-06,
      "loss": 0.0028,
      "num_tokens": 3765251.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2970
    },
    {
      "completion_length": 11.325,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.7,
      "completions/max_terminated_length": 14.7,
      "completions/mean_length": 11.325,
      "completions/mean_terminated_length": 11.325,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.20492366937147571,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003919885028153658,
      "kl": 1.5202113151550294,
      "learning_rate": 2.7523540489642185e-06,
      "loss": 0.0015,
      "num_tokens": 3778692.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2980
    },
    {
      "completion_length": 13.125,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 19.3,
      "completions/max_terminated_length": 19.3,
      "completions/mean_length": 13.125,
      "completions/mean_terminated_length": 13.125,
      "completions/min_length": 9.7,
      "completions/min_terminated_length": 9.7,
      "epoch": 0.20561133269151424,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.10944122821092606,
      "kl": 1.2642498075962068,
      "learning_rate": 2.742937853107345e-06,
      "loss": 0.0013,
      "num_tokens": 3791293.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 2990
    },
    {
      "completion_length": 13.525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 21.3,
      "completions/max_terminated_length": 21.3,
      "completions/mean_length": 13.525,
      "completions/mean_terminated_length": 13.525,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.20629899601155274,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006753021385520697,
      "kl": 1.4397485315799714,
      "learning_rate": 2.733521657250471e-06,
      "loss": 0.0014,
      "num_tokens": 3803098.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3000
    },
    {
      "completion_length": 12.425,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.7,
      "completions/max_terminated_length": 15.7,
      "completions/mean_length": 12.425,
      "completions/mean_terminated_length": 12.425,
      "completions/min_length": 9.9,
      "completions/min_terminated_length": 9.9,
      "epoch": 0.20698665933159124,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0026066696736961603,
      "kl": 1.0516700327396393,
      "learning_rate": 2.7241054613935973e-06,
      "loss": 0.0011,
      "num_tokens": 3815979.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3010
    },
    {
      "completion_length": 11.35,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.0,
      "completions/max_terminated_length": 15.0,
      "completions/mean_length": 11.35,
      "completions/mean_terminated_length": 11.35,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.20767432265162977,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022212343756109476,
      "kl": 1.3609875798225404,
      "learning_rate": 2.7146892655367236e-06,
      "loss": 0.0014,
      "num_tokens": 3828709.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3020
    },
    {
      "completion_length": 15.95,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 28.1,
      "completions/max_terminated_length": 28.1,
      "completions/mean_length": 15.95,
      "completions/mean_terminated_length": 15.95,
      "completions/min_length": 9.5,
      "completions/min_terminated_length": 9.5,
      "epoch": 0.20836198597166827,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0025453646667301655,
      "kl": 1.0688377380371095,
      "learning_rate": 2.7052730696798496e-06,
      "loss": 0.0011,
      "num_tokens": 3839227.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3030
    },
    {
      "completion_length": 12.575,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 20.9,
      "completions/max_terminated_length": 20.9,
      "completions/mean_length": 12.575,
      "completions/mean_terminated_length": 12.575,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.20904964929170677,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0028345719911158085,
      "kl": 1.3270336389541626,
      "learning_rate": 2.6958568738229756e-06,
      "loss": 0.0013,
      "num_tokens": 3851634.0,
      "reward": 5.9625,
      "reward_std": 0.075,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.05,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3040
    },
    {
      "completion_length": 11.25,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.8,
      "completions/max_terminated_length": 14.8,
      "completions/mean_length": 11.25,
      "completions/mean_terminated_length": 11.25,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.2097373126117453,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004961980972439051,
      "kl": 1.3371050000190734,
      "learning_rate": 2.686440677966102e-06,
      "loss": 0.0013,
      "num_tokens": 3865056.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3050
    },
    {
      "completion_length": 11.1,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.0,
      "completions/max_terminated_length": 15.0,
      "completions/mean_length": 11.1,
      "completions/mean_terminated_length": 11.1,
      "completions/min_length": 8.6,
      "completions/min_terminated_length": 8.6,
      "epoch": 0.2104249759317838,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.003095800057053566,
      "kl": 1.2146135807037353,
      "learning_rate": 2.677024482109228e-06,
      "loss": 0.0012,
      "num_tokens": 3876756.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3060
    },
    {
      "completion_length": 11.575,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.1,
      "completions/max_terminated_length": 14.1,
      "completions/mean_length": 11.575,
      "completions/mean_terminated_length": 11.575,
      "completions/min_length": 9.9,
      "completions/min_terminated_length": 9.9,
      "epoch": 0.2111126392518223,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020866338163614273,
      "kl": 1.2059853732585908,
      "learning_rate": 2.6676082862523543e-06,
      "loss": 0.0012,
      "num_tokens": 3891107.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3070
    },
    {
      "completion_length": 11.575,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.9,
      "completions/max_terminated_length": 14.9,
      "completions/mean_length": 11.575,
      "completions/mean_terminated_length": 11.575,
      "completions/min_length": 9.2,
      "completions/min_terminated_length": 9.2,
      "epoch": 0.2118003025718608,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0037311518099159002,
      "kl": 1.2837676525115966,
      "learning_rate": 2.6581920903954807e-06,
      "loss": 0.0013,
      "num_tokens": 3903030.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3080
    },
    {
      "completion_length": 12.2,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.2,
      "completions/max_terminated_length": 16.2,
      "completions/mean_length": 12.2,
      "completions/mean_terminated_length": 12.2,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.21248796589189933,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004318287596106529,
      "kl": 1.1657697916030885,
      "learning_rate": 2.6487758945386066e-06,
      "loss": 0.0012,
      "num_tokens": 3916166.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3090
    },
    {
      "completion_length": 11.875,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.3,
      "completions/max_terminated_length": 15.3,
      "completions/mean_length": 11.875,
      "completions/mean_terminated_length": 11.875,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.21317562921193783,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004012678749859333,
      "kl": 1.1232018172740936,
      "learning_rate": 2.6393596986817326e-06,
      "loss": 0.0011,
      "num_tokens": 3927217.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3100
    },
    {
      "completion_length": 12.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.6,
      "completions/max_terminated_length": 17.6,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.21386329253197633,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.13298767805099487,
      "kl": 1.5185560762882233,
      "learning_rate": 2.629943502824859e-06,
      "loss": 0.0015,
      "num_tokens": 3938505.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3110
    },
    {
      "completion_length": 11.725,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.2,
      "completions/max_terminated_length": 15.2,
      "completions/mean_length": 11.725,
      "completions/mean_terminated_length": 11.725,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.21455095585201486,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006646719295531511,
      "kl": 1.2882408559322358,
      "learning_rate": 2.6205273069679853e-06,
      "loss": 0.0013,
      "num_tokens": 3951066.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3120
    },
    {
      "completion_length": 12.225,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.5,
      "completions/max_terminated_length": 16.5,
      "completions/mean_length": 12.225,
      "completions/mean_terminated_length": 12.225,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.21523861917205336,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007941076532006264,
      "kl": 1.1894244372844696,
      "learning_rate": 2.6111111111111113e-06,
      "loss": 0.0012,
      "num_tokens": 3962491.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3130
    },
    {
      "completion_length": 14.025,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 26.0,
      "completions/max_terminated_length": 26.0,
      "completions/mean_length": 14.025,
      "completions/mean_terminated_length": 14.025,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.21592628249209186,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 2.529683828353882,
      "kl": 1.2520922303199769,
      "learning_rate": 2.6016949152542377e-06,
      "loss": 0.0013,
      "num_tokens": 3974576.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.05,
      "rewards/match_format_approximately/mean": 0.975,
      "rewards/match_format_approximately/std": 0.05,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3140
    },
    {
      "completion_length": 13.15,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.4,
      "completions/max_terminated_length": 16.4,
      "completions/mean_length": 13.15,
      "completions/mean_terminated_length": 13.15,
      "completions/min_length": 10.2,
      "completions/min_terminated_length": 10.2,
      "epoch": 0.2166139458121304,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0029135840013623238,
      "kl": 1.076604688167572,
      "learning_rate": 2.5922787193973636e-06,
      "loss": 0.0011,
      "num_tokens": 3986806.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3150
    },
    {
      "completion_length": 13.6,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 19.0,
      "completions/max_terminated_length": 19.0,
      "completions/mean_length": 13.6,
      "completions/mean_terminated_length": 13.6,
      "completions/min_length": 10.1,
      "completions/min_terminated_length": 10.1,
      "epoch": 0.2173016091321689,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00409655412659049,
      "kl": 1.1538120567798615,
      "learning_rate": 2.5828625235404896e-06,
      "loss": 0.0012,
      "num_tokens": 3999858.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3160
    },
    {
      "completion_length": 11.875,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.8,
      "completions/max_terminated_length": 14.8,
      "completions/mean_length": 11.875,
      "completions/mean_terminated_length": 11.875,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.2179892724522074,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003736306680366397,
      "kl": 1.0594617307186127,
      "learning_rate": 2.573446327683616e-06,
      "loss": 0.0011,
      "num_tokens": 4012533.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3170
    },
    {
      "completion_length": 11.4,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.3,
      "completions/max_terminated_length": 15.3,
      "completions/mean_length": 11.4,
      "completions/mean_terminated_length": 11.4,
      "completions/min_length": 8.6,
      "completions/min_terminated_length": 8.6,
      "epoch": 0.2186769357722459,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004909292794764042,
      "kl": 1.300600254535675,
      "learning_rate": 2.5640301318267423e-06,
      "loss": 0.0013,
      "num_tokens": 4025381.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3180
    },
    {
      "completion_length": 11.575,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.6,
      "completions/max_terminated_length": 14.6,
      "completions/mean_length": 11.575,
      "completions/mean_terminated_length": 11.575,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.21936459909228442,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003566068597137928,
      "kl": 1.2702523350715638,
      "learning_rate": 2.5546139359698683e-06,
      "loss": 0.0013,
      "num_tokens": 4036896.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3190
    },
    {
      "completion_length": 11.65,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.2,
      "completions/max_terminated_length": 15.2,
      "completions/mean_length": 11.65,
      "completions/mean_terminated_length": 11.65,
      "completions/min_length": 9.4,
      "completions/min_terminated_length": 9.4,
      "epoch": 0.22005226241232292,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.020172731950879097,
      "kl": 1.0868790686130523,
      "learning_rate": 2.5451977401129947e-06,
      "loss": 0.0011,
      "num_tokens": 4050002.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3200
    },
    {
      "completion_length": 11.325,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.0,
      "completions/max_terminated_length": 14.0,
      "completions/mean_length": 11.325,
      "completions/mean_terminated_length": 11.325,
      "completions/min_length": 9.7,
      "completions/min_terminated_length": 9.7,
      "epoch": 0.22073992573236143,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004111476242542267,
      "kl": 1.1443917155265808,
      "learning_rate": 2.5357815442561206e-06,
      "loss": 0.0011,
      "num_tokens": 4064027.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3210
    },
    {
      "completion_length": 15.975,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 32.1,
      "completions/max_terminated_length": 32.1,
      "completions/mean_length": 15.975,
      "completions/mean_terminated_length": 15.975,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.22142758905239995,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 2.2132339477539062,
      "kl": 1.1676890075206756,
      "learning_rate": 2.5263653483992466e-06,
      "loss": 0.0012,
      "num_tokens": 4077274.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3220
    },
    {
      "completion_length": 10.9,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.1,
      "completions/max_terminated_length": 14.1,
      "completions/mean_length": 10.9,
      "completions/mean_terminated_length": 10.9,
      "completions/min_length": 8.6,
      "completions/min_terminated_length": 8.6,
      "epoch": 0.22211525237243845,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.017527904361486435,
      "kl": 1.3145560443401336,
      "learning_rate": 2.516949152542373e-06,
      "loss": 0.0013,
      "num_tokens": 4089186.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3230
    },
    {
      "completion_length": 12.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.4,
      "completions/max_terminated_length": 15.4,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 9.4,
      "completions/min_terminated_length": 9.4,
      "epoch": 0.22280291569247695,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005846535321325064,
      "kl": 1.8368690073490144,
      "learning_rate": 2.5075329566854994e-06,
      "loss": 0.0018,
      "num_tokens": 4101546.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3240
    },
    {
      "completion_length": 11.375,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.3,
      "completions/max_terminated_length": 15.3,
      "completions/mean_length": 11.375,
      "completions/mean_terminated_length": 11.375,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.22349057901251548,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.06038391590118408,
      "kl": 1.3158239006996155,
      "learning_rate": 2.4981167608286257e-06,
      "loss": 0.0013,
      "num_tokens": 4114641.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3250
    },
    {
      "completion_length": 11.575,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.4,
      "completions/max_terminated_length": 14.4,
      "completions/mean_length": 11.575,
      "completions/mean_terminated_length": 11.575,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.22417824233255398,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027491487562656403,
      "kl": 1.2638065814971924,
      "learning_rate": 2.4887005649717517e-06,
      "loss": 0.0013,
      "num_tokens": 4126064.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3260
    },
    {
      "completion_length": 11.175,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.6,
      "completions/max_terminated_length": 13.6,
      "completions/mean_length": 11.175,
      "completions/mean_terminated_length": 11.175,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.22486590565259248,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021324739791452885,
      "kl": 1.341496205329895,
      "learning_rate": 2.4792843691148776e-06,
      "loss": 0.0013,
      "num_tokens": 4137723.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3270
    },
    {
      "completion_length": 11.625,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.7,
      "completions/max_terminated_length": 14.7,
      "completions/mean_length": 11.625,
      "completions/mean_terminated_length": 11.625,
      "completions/min_length": 9.2,
      "completions/min_terminated_length": 9.2,
      "epoch": 0.225553568972631,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002356611890718341,
      "kl": 1.2188643753528594,
      "learning_rate": 2.469868173258004e-06,
      "loss": 0.0012,
      "num_tokens": 4149288.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3280
    },
    {
      "completion_length": 10.975,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.7,
      "completions/max_terminated_length": 13.7,
      "completions/mean_length": 10.975,
      "completions/mean_terminated_length": 10.975,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.22624123229266951,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019462181953713298,
      "kl": 1.0421675980091094,
      "learning_rate": 2.46045197740113e-06,
      "loss": 0.001,
      "num_tokens": 4162103.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3290
    },
    {
      "completion_length": 20.7,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 49.1,
      "completions/max_terminated_length": 49.1,
      "completions/mean_length": 20.7,
      "completions/mean_terminated_length": 20.7,
      "completions/min_length": 9.4,
      "completions/min_terminated_length": 9.4,
      "epoch": 0.22692889561270801,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.004338334780186415,
      "kl": 1.2982787430286407,
      "learning_rate": 2.4510357815442564e-06,
      "loss": 0.0013,
      "num_tokens": 4174743.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3300
    },
    {
      "completion_length": 11.275,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.0,
      "completions/max_terminated_length": 14.0,
      "completions/mean_length": 11.275,
      "completions/mean_terminated_length": 11.275,
      "completions/min_length": 9.2,
      "completions/min_terminated_length": 9.2,
      "epoch": 0.22761655893274652,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005578485317528248,
      "kl": 1.3092613160610198,
      "learning_rate": 2.4416195856873827e-06,
      "loss": 0.0013,
      "num_tokens": 4186514.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3310
    },
    {
      "completion_length": 11.75,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.6,
      "completions/max_terminated_length": 15.6,
      "completions/mean_length": 11.75,
      "completions/mean_terminated_length": 11.75,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.22830422225278504,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.03091367706656456,
      "kl": 1.1664791226387023,
      "learning_rate": 2.4322033898305087e-06,
      "loss": 0.0012,
      "num_tokens": 4199232.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3320
    },
    {
      "completion_length": 11.85,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.4,
      "completions/max_terminated_length": 16.4,
      "completions/mean_length": 11.85,
      "completions/mean_terminated_length": 11.85,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.22899188557282354,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007058318238705397,
      "kl": 1.1536859154701233,
      "learning_rate": 2.422787193973635e-06,
      "loss": 0.0012,
      "num_tokens": 4212258.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3330
    },
    {
      "completion_length": 10.4,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.8,
      "completions/max_terminated_length": 12.8,
      "completions/mean_length": 10.4,
      "completions/mean_terminated_length": 10.4,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.22967954889286205,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004380435682833195,
      "kl": 1.3366443276405335,
      "learning_rate": 2.413370998116761e-06,
      "loss": 0.0013,
      "num_tokens": 4225234.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3340
    },
    {
      "completion_length": 11.375,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.3,
      "completions/max_terminated_length": 14.3,
      "completions/mean_length": 11.375,
      "completions/mean_terminated_length": 11.375,
      "completions/min_length": 8.6,
      "completions/min_terminated_length": 8.6,
      "epoch": 0.23036721221290057,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004277032800018787,
      "kl": 1.1180653333663941,
      "learning_rate": 2.403954802259887e-06,
      "loss": 0.0011,
      "num_tokens": 4235825.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3350
    },
    {
      "completion_length": 11.675,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.0,
      "completions/max_terminated_length": 16.0,
      "completions/mean_length": 11.675,
      "completions/mean_terminated_length": 11.675,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.23105487553293907,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005623528268188238,
      "kl": 1.3497309505939483,
      "learning_rate": 2.3945386064030134e-06,
      "loss": 0.0013,
      "num_tokens": 4249708.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3360
    },
    {
      "completion_length": 11.85,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.3,
      "completions/max_terminated_length": 16.3,
      "completions/mean_length": 11.85,
      "completions/mean_terminated_length": 11.85,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.23174253885297758,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0034109626431018114,
      "kl": 1.3263630867004395,
      "learning_rate": 2.3851224105461398e-06,
      "loss": 0.0013,
      "num_tokens": 4262546.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3370
    },
    {
      "completion_length": 11.7,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.7,
      "completions/max_terminated_length": 15.7,
      "completions/mean_length": 11.7,
      "completions/mean_terminated_length": 11.7,
      "completions/min_length": 9.5,
      "completions/min_terminated_length": 9.5,
      "epoch": 0.2324302021730161,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002905854256823659,
      "kl": 1.2896348893642426,
      "learning_rate": 2.3757062146892657e-06,
      "loss": 0.0013,
      "num_tokens": 4276578.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3380
    },
    {
      "completion_length": 12.325,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.4,
      "completions/max_terminated_length": 17.4,
      "completions/mean_length": 12.325,
      "completions/mean_terminated_length": 12.325,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.2331178654930546,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002098673954606056,
      "kl": 1.4132690966129302,
      "learning_rate": 2.366290018832392e-06,
      "loss": 0.0014,
      "num_tokens": 4287999.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3390
    },
    {
      "completion_length": 14.6,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 30.1,
      "completions/max_terminated_length": 30.1,
      "completions/mean_length": 14.6,
      "completions/mean_terminated_length": 14.6,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.2338055288130931,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0030663420911878347,
      "kl": 1.9233894765377044,
      "learning_rate": 2.356873822975518e-06,
      "loss": 0.0019,
      "num_tokens": 4299531.0,
      "reward": 5.85,
      "reward_std": 0.12247449159622192,
      "rewards/check_coherence/mean": 1.45,
      "rewards/check_coherence/std": 0.05773502588272095,
      "rewards/check_response_quality/mean": 2.4375,
      "rewards/check_response_quality/std": 0.047871357202529906,
      "rewards/match_format_approximately/mean": 0.9625,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3400
    },
    {
      "completion_length": 11.8,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.1,
      "completions/max_terminated_length": 16.1,
      "completions/mean_length": 11.8,
      "completions/mean_terminated_length": 11.8,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.2344931921331316,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009669655933976173,
      "kl": 1.1667460262775422,
      "learning_rate": 2.347457627118644e-06,
      "loss": 0.0012,
      "num_tokens": 4311467.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3410
    },
    {
      "completion_length": 10.8,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.5,
      "completions/max_terminated_length": 14.5,
      "completions/mean_length": 10.8,
      "completions/mean_terminated_length": 10.8,
      "completions/min_length": 8.2,
      "completions/min_terminated_length": 8.2,
      "epoch": 0.23518085545317013,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004496072884649038,
      "kl": 1.3729178309440613,
      "learning_rate": 2.3380414312617704e-06,
      "loss": 0.0014,
      "num_tokens": 4322375.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3420
    },
    {
      "completion_length": 12.5,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.9,
      "completions/max_terminated_length": 16.9,
      "completions/mean_length": 12.5,
      "completions/mean_terminated_length": 12.5,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.23586851877320864,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003123059868812561,
      "kl": 1.3280323922634125,
      "learning_rate": 2.3286252354048968e-06,
      "loss": 0.0013,
      "num_tokens": 4334663.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3430
    },
    {
      "completion_length": 12.95,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 20.7,
      "completions/max_terminated_length": 20.7,
      "completions/mean_length": 12.95,
      "completions/mean_terminated_length": 12.95,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.23655618209324714,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00793770793825388,
      "kl": 1.1402764916419983,
      "learning_rate": 2.3192090395480227e-06,
      "loss": 0.0011,
      "num_tokens": 4346017.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3440
    },
    {
      "completion_length": 11.9,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.9,
      "completions/max_terminated_length": 15.9,
      "completions/mean_length": 11.9,
      "completions/mean_terminated_length": 11.9,
      "completions/min_length": 9.2,
      "completions/min_terminated_length": 9.2,
      "epoch": 0.23724384541328566,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027919018175452948,
      "kl": 1.099853754043579,
      "learning_rate": 2.309792843691149e-06,
      "loss": 0.0011,
      "num_tokens": 4359793.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3450
    },
    {
      "completion_length": 10.85,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.8,
      "completions/max_terminated_length": 13.8,
      "completions/mean_length": 10.85,
      "completions/mean_terminated_length": 10.85,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.23793150873332417,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0067101349122822285,
      "kl": 1.4456099629402162,
      "learning_rate": 2.300376647834275e-06,
      "loss": 0.0014,
      "num_tokens": 4373115.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3460
    },
    {
      "completion_length": 13.125,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 23.1,
      "completions/max_terminated_length": 23.1,
      "completions/mean_length": 13.125,
      "completions/mean_terminated_length": 13.125,
      "completions/min_length": 8.1,
      "completions/min_terminated_length": 8.1,
      "epoch": 0.23861917205336267,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.07878517359495163,
      "kl": 2.221970522403717,
      "learning_rate": 2.290960451977401e-06,
      "loss": 0.0022,
      "num_tokens": 4386292.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3470
    },
    {
      "completion_length": 12.45,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.4,
      "completions/max_terminated_length": 17.4,
      "completions/mean_length": 12.45,
      "completions/mean_terminated_length": 12.45,
      "completions/min_length": 9.7,
      "completions/min_terminated_length": 9.7,
      "epoch": 0.2393068353734012,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027336543425917625,
      "kl": 1.2365751266479492,
      "learning_rate": 2.2815442561205274e-06,
      "loss": 0.0012,
      "num_tokens": 4398422.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3480
    },
    {
      "completion_length": 12.25,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.4,
      "completions/max_terminated_length": 16.4,
      "completions/mean_length": 12.25,
      "completions/mean_terminated_length": 12.25,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.2399944986934397,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021766237914562225,
      "kl": 1.2284077286720276,
      "learning_rate": 2.2721280602636538e-06,
      "loss": 0.0012,
      "num_tokens": 4410376.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3490
    },
    {
      "completion_length": 10.925,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.6,
      "completions/max_terminated_length": 13.6,
      "completions/mean_length": 10.925,
      "completions/mean_terminated_length": 10.925,
      "completions/min_length": 8.6,
      "completions/min_terminated_length": 8.6,
      "epoch": 0.2406821620134782,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0021278143394738436,
      "kl": 1.2818358659744262,
      "learning_rate": 2.26271186440678e-06,
      "loss": 0.0013,
      "num_tokens": 4422425.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3500
    },
    {
      "completion_length": 11.3,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.6,
      "completions/max_terminated_length": 14.6,
      "completions/mean_length": 11.3,
      "completions/mean_terminated_length": 11.3,
      "completions/min_length": 8.2,
      "completions/min_terminated_length": 8.2,
      "epoch": 0.2413698253335167,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027870163321495056,
      "kl": 1.248415756225586,
      "learning_rate": 2.253295668549906e-06,
      "loss": 0.0012,
      "num_tokens": 4433021.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3510
    },
    {
      "completion_length": 12.675,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 18.2,
      "completions/max_terminated_length": 18.2,
      "completions/mean_length": 12.675,
      "completions/mean_terminated_length": 12.675,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.24205748865355523,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.009928030893206596,
      "kl": 1.2978311777114868,
      "learning_rate": 2.243879472693032e-06,
      "loss": 0.0013,
      "num_tokens": 4446060.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3520
    },
    {
      "completion_length": 13.65,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 21.2,
      "completions/max_terminated_length": 21.2,
      "completions/mean_length": 13.65,
      "completions/mean_terminated_length": 13.65,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.24274515197359373,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.028688477352261543,
      "kl": 1.277863186597824,
      "learning_rate": 2.2344632768361585e-06,
      "loss": 0.0013,
      "num_tokens": 4458562.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3530
    },
    {
      "completion_length": 11.65,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.8,
      "completions/max_terminated_length": 15.8,
      "completions/mean_length": 11.65,
      "completions/mean_terminated_length": 11.65,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.24343281529363223,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0037820255383849144,
      "kl": 1.2518242657184602,
      "learning_rate": 2.2250470809792844e-06,
      "loss": 0.0013,
      "num_tokens": 4469756.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3540
    },
    {
      "completion_length": 50.75,
      "completions/clipped_ratio": 0.025,
      "completions/max_length": 171.3,
      "completions/max_terminated_length": 13.9,
      "completions/mean_length": 50.75,
      "completions/mean_terminated_length": 11.4,
      "completions/min_length": 9.2,
      "completions/min_terminated_length": 9.2,
      "epoch": 0.24412047861367075,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0025780112482607365,
      "kl": 1.1971634149551391,
      "learning_rate": 2.215630885122411e-06,
      "loss": 0.0012,
      "num_tokens": 4484142.0,
      "reward": 5.9375,
      "reward_std": 0.125,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.05,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3550
    },
    {
      "completion_length": 15.85,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 30.9,
      "completions/max_terminated_length": 30.9,
      "completions/mean_length": 15.85,
      "completions/mean_terminated_length": 15.85,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.24480814193370926,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0080100167542696,
      "kl": 1.3040795743465423,
      "learning_rate": 2.206214689265537e-06,
      "loss": 0.0013,
      "num_tokens": 4497160.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3560
    },
    {
      "completion_length": 10.525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.6,
      "completions/max_terminated_length": 12.6,
      "completions/mean_length": 10.525,
      "completions/mean_terminated_length": 10.525,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.24549580525374776,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027778451330959797,
      "kl": 1.283364176750183,
      "learning_rate": 2.196798493408663e-06,
      "loss": 0.0013,
      "num_tokens": 4507981.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3570
    },
    {
      "completion_length": 11.325,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.9,
      "completions/max_terminated_length": 14.9,
      "completions/mean_length": 11.325,
      "completions/mean_terminated_length": 11.325,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.24618346857378628,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.005003831349313259,
      "kl": 1.2212236881256104,
      "learning_rate": 2.187382297551789e-06,
      "loss": 0.0012,
      "num_tokens": 4520882.0,
      "reward": 5.9875,
      "reward_std": 0.025,
      "rewards/check_coherence/mean": 1.4875,
      "rewards/check_coherence/std": 0.025,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3580
    },
    {
      "completion_length": 12.175,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.6,
      "completions/max_terminated_length": 16.6,
      "completions/mean_length": 12.175,
      "completions/mean_terminated_length": 12.175,
      "completions/min_length": 9.5,
      "completions/min_terminated_length": 9.5,
      "epoch": 0.24687113189382479,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.011205293238162994,
      "kl": 1.1657124042510987,
      "learning_rate": 2.1779661016949155e-06,
      "loss": 0.0012,
      "num_tokens": 4530921.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3590
    },
    {
      "completion_length": 10.525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.5,
      "completions/max_terminated_length": 12.5,
      "completions/mean_length": 10.525,
      "completions/mean_terminated_length": 10.525,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.2475587952138633,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003226573346182704,
      "kl": 1.4166472613811494,
      "learning_rate": 2.1685499058380414e-06,
      "loss": 0.0014,
      "num_tokens": 4545070.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3600
    },
    {
      "completion_length": 11.425,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.7,
      "completions/max_terminated_length": 14.7,
      "completions/mean_length": 11.425,
      "completions/mean_terminated_length": 11.425,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.24824645853390181,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015919512370601296,
      "kl": 1.390343391895294,
      "learning_rate": 2.159133709981168e-06,
      "loss": 0.0014,
      "num_tokens": 4557447.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3610
    },
    {
      "completion_length": 11.25,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.7,
      "completions/max_terminated_length": 15.7,
      "completions/mean_length": 11.25,
      "completions/mean_terminated_length": 11.25,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.24893412185394032,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003537840908393264,
      "kl": 1.4276194095611572,
      "learning_rate": 2.149717514124294e-06,
      "loss": 0.0014,
      "num_tokens": 4569357.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3620
    },
    {
      "completion_length": 11.775,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.1,
      "completions/max_terminated_length": 15.1,
      "completions/mean_length": 11.775,
      "completions/mean_terminated_length": 11.775,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.24962178517397882,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.010827281512320042,
      "kl": 1.2425177216529846,
      "learning_rate": 2.14030131826742e-06,
      "loss": 0.0012,
      "num_tokens": 4583028.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3630
    },
    {
      "completion_length": 16.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 31.2,
      "completions/max_terminated_length": 31.2,
      "completions/mean_length": 16.0,
      "completions/mean_terminated_length": 16.0,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.2503094484940173,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.019427252933382988,
      "kl": 1.2630140125751494,
      "learning_rate": 2.130885122410546e-06,
      "loss": 0.0013,
      "num_tokens": 4595408.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3640
    },
    {
      "completion_length": 13.7,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 20.3,
      "completions/max_terminated_length": 20.3,
      "completions/mean_length": 13.7,
      "completions/mean_terminated_length": 13.7,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.2509971118140558,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003992835059762001,
      "kl": 1.285544329881668,
      "learning_rate": 2.1214689265536725e-06,
      "loss": 0.0013,
      "num_tokens": 4607900.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3650
    },
    {
      "completion_length": 14.2,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 21.8,
      "completions/max_terminated_length": 21.8,
      "completions/mean_length": 14.2,
      "completions/mean_terminated_length": 14.2,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.2516847751340944,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0025455814320594072,
      "kl": 1.118562602996826,
      "learning_rate": 2.1120527306967984e-06,
      "loss": 0.0011,
      "num_tokens": 4619776.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3660
    },
    {
      "completion_length": 13.925,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 21.9,
      "completions/max_terminated_length": 21.9,
      "completions/mean_length": 13.925,
      "completions/mean_terminated_length": 13.925,
      "completions/min_length": 9.9,
      "completions/min_terminated_length": 9.9,
      "epoch": 0.2523724384541329,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005908417049795389,
      "kl": 1.1965693056583404,
      "learning_rate": 2.102636534839925e-06,
      "loss": 0.0012,
      "num_tokens": 4632933.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3670
    },
    {
      "completion_length": 12.15,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 18.1,
      "completions/max_terminated_length": 18.1,
      "completions/mean_length": 12.15,
      "completions/mean_terminated_length": 12.15,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.2530601017741714,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004306701943278313,
      "kl": 1.1101278483867645,
      "learning_rate": 2.093220338983051e-06,
      "loss": 0.0011,
      "num_tokens": 4646659.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3680
    },
    {
      "completion_length": 30.85,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 90.2,
      "completions/max_terminated_length": 90.2,
      "completions/mean_length": 30.85,
      "completions/mean_terminated_length": 30.85,
      "completions/min_length": 9.4,
      "completions/min_terminated_length": 9.4,
      "epoch": 0.2537477650942099,
      "frac_reward_zero_std": 0.7,
      "grad_norm": 0.0031726714223623276,
      "kl": 0.928079804778099,
      "learning_rate": 2.083804143126177e-06,
      "loss": 0.0009,
      "num_tokens": 4661733.0,
      "reward": 5.9125,
      "reward_std": 0.175,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.45,
      "rewards/check_response_quality/std": 0.1,
      "rewards/match_format_approximately/mean": 0.9625,
      "rewards/match_format_approximately/std": 0.075,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3690
    },
    {
      "completion_length": 18.325,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 40.9,
      "completions/max_terminated_length": 40.9,
      "completions/mean_length": 18.325,
      "completions/mean_terminated_length": 18.325,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.2544354284142484,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.003087216755375266,
      "kl": 1.2326574087142945,
      "learning_rate": 2.0743879472693035e-06,
      "loss": 0.0012,
      "num_tokens": 4672266.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3700
    },
    {
      "completion_length": 11.525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.4,
      "completions/max_terminated_length": 16.4,
      "completions/mean_length": 11.525,
      "completions/mean_terminated_length": 11.525,
      "completions/min_length": 7.8,
      "completions/min_terminated_length": 7.8,
      "epoch": 0.2551230917342869,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020335062872618437,
      "kl": 1.298705518245697,
      "learning_rate": 2.0649717514124295e-06,
      "loss": 0.0013,
      "num_tokens": 4685299.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3710
    },
    {
      "completion_length": 16.55,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 37.3,
      "completions/max_terminated_length": 37.3,
      "completions/mean_length": 16.55,
      "completions/mean_terminated_length": 16.55,
      "completions/min_length": 8.2,
      "completions/min_terminated_length": 8.2,
      "epoch": 0.2558107550543254,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.01009492576122284,
      "kl": 1.3413202583789825,
      "learning_rate": 2.0555555555555555e-06,
      "loss": 0.0013,
      "num_tokens": 4698157.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3720
    },
    {
      "completion_length": 10.7,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.6,
      "completions/max_terminated_length": 14.6,
      "completions/mean_length": 10.7,
      "completions/mean_terminated_length": 10.7,
      "completions/min_length": 8.1,
      "completions/min_terminated_length": 8.1,
      "epoch": 0.25649841837436393,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003856506897136569,
      "kl": 1.3558381140232085,
      "learning_rate": 2.046139359698682e-06,
      "loss": 0.0014,
      "num_tokens": 4708737.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3730
    },
    {
      "completion_length": 10.65,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.6,
      "completions/max_terminated_length": 14.6,
      "completions/mean_length": 10.65,
      "completions/mean_terminated_length": 10.65,
      "completions/min_length": 7.9,
      "completions/min_terminated_length": 7.9,
      "epoch": 0.25718608169440244,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032971419859677553,
      "kl": 1.7668309211730957,
      "learning_rate": 2.036723163841808e-06,
      "loss": 0.0018,
      "num_tokens": 4720975.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3740
    },
    {
      "completion_length": 13.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 19.5,
      "completions/max_terminated_length": 19.5,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 8.1,
      "completions/min_terminated_length": 8.1,
      "epoch": 0.25787374501444094,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.016410939395427704,
      "kl": 1.5257445216178893,
      "learning_rate": 2.027306967984934e-06,
      "loss": 0.0015,
      "num_tokens": 4732599.0,
      "reward": 5.9625,
      "reward_std": 0.075,
      "rewards/check_coherence/mean": 1.4875,
      "rewards/check_coherence/std": 0.025,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3750
    },
    {
      "completion_length": 10.575,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.4,
      "completions/max_terminated_length": 13.4,
      "completions/mean_length": 10.575,
      "completions/mean_terminated_length": 10.575,
      "completions/min_length": 7.9,
      "completions/min_terminated_length": 7.9,
      "epoch": 0.25856140833447944,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0026505778077989817,
      "kl": 1.4891623914241792,
      "learning_rate": 2.0178907721280605e-06,
      "loss": 0.0015,
      "num_tokens": 4745006.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3760
    },
    {
      "completion_length": 11.75,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.1,
      "completions/max_terminated_length": 15.1,
      "completions/mean_length": 11.75,
      "completions/mean_terminated_length": 11.75,
      "completions/min_length": 8.6,
      "completions/min_terminated_length": 8.6,
      "epoch": 0.25924907165451794,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0036013510543853045,
      "kl": 1.337304413318634,
      "learning_rate": 2.0084745762711865e-06,
      "loss": 0.0013,
      "num_tokens": 4758060.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3770
    },
    {
      "completion_length": 10.95,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.5,
      "completions/max_terminated_length": 13.5,
      "completions/mean_length": 10.95,
      "completions/mean_terminated_length": 10.95,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.25993673497455644,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0065917568281292915,
      "kl": 1.379406750202179,
      "learning_rate": 1.9990583804143125e-06,
      "loss": 0.0014,
      "num_tokens": 4770086.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3780
    },
    {
      "completion_length": 16.125,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 33.4,
      "completions/max_terminated_length": 33.4,
      "completions/mean_length": 16.125,
      "completions/mean_terminated_length": 16.125,
      "completions/min_length": 8.6,
      "completions/min_terminated_length": 8.6,
      "epoch": 0.26062439829459494,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 2.601729393005371,
      "kl": 1.3989966630935669,
      "learning_rate": 1.989642184557439e-06,
      "loss": 0.0014,
      "num_tokens": 4782599.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3790
    },
    {
      "completion_length": 11.025,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.7,
      "completions/max_terminated_length": 14.7,
      "completions/mean_length": 11.025,
      "completions/mean_terminated_length": 11.025,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.2613120616146335,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006292569916695356,
      "kl": 1.5066055417060853,
      "learning_rate": 1.9802259887005652e-06,
      "loss": 0.0015,
      "num_tokens": 4793128.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3800
    },
    {
      "completion_length": 11.35,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.1,
      "completions/max_terminated_length": 14.1,
      "completions/mean_length": 11.35,
      "completions/mean_terminated_length": 11.35,
      "completions/min_length": 9.2,
      "completions/min_terminated_length": 9.2,
      "epoch": 0.261999724934672,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.013005614280700684,
      "kl": 1.2771723389625549,
      "learning_rate": 1.970809792843691e-06,
      "loss": 0.0013,
      "num_tokens": 4806370.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3810
    },
    {
      "completion_length": 10.1,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 11.9,
      "completions/max_terminated_length": 11.9,
      "completions/mean_length": 10.1,
      "completions/mean_terminated_length": 10.1,
      "completions/min_length": 8.2,
      "completions/min_terminated_length": 8.2,
      "epoch": 0.2626873882547105,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003991563804447651,
      "kl": 1.386504888534546,
      "learning_rate": 1.9613935969868176e-06,
      "loss": 0.0014,
      "num_tokens": 4817862.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3820
    },
    {
      "completion_length": 24.625,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 66.4,
      "completions/max_terminated_length": 66.4,
      "completions/mean_length": 24.625,
      "completions/mean_terminated_length": 24.625,
      "completions/min_length": 7.7,
      "completions/min_terminated_length": 7.7,
      "epoch": 0.263375051574749,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.007266217842698097,
      "kl": 1.4715389907360077,
      "learning_rate": 1.9519774011299435e-06,
      "loss": 0.0015,
      "num_tokens": 4830747.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3830
    },
    {
      "completion_length": 11.75,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.2,
      "completions/max_terminated_length": 16.2,
      "completions/mean_length": 11.75,
      "completions/mean_terminated_length": 11.75,
      "completions/min_length": 8.6,
      "completions/min_terminated_length": 8.6,
      "epoch": 0.2640627148947875,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0025213544722646475,
      "kl": 1.2671421408653258,
      "learning_rate": 1.94256120527307e-06,
      "loss": 0.0013,
      "num_tokens": 4841201.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3840
    },
    {
      "completion_length": 11.275,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.4,
      "completions/max_terminated_length": 16.4,
      "completions/mean_length": 11.275,
      "completions/mean_terminated_length": 11.275,
      "completions/min_length": 8.1,
      "completions/min_terminated_length": 8.1,
      "epoch": 0.264750378214826,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0049027493223547935,
      "kl": 1.5967715799808502,
      "learning_rate": 1.933145009416196e-06,
      "loss": 0.0016,
      "num_tokens": 4852968.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3850
    },
    {
      "completion_length": 11.125,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.5,
      "completions/max_terminated_length": 15.5,
      "completions/mean_length": 11.125,
      "completions/mean_terminated_length": 11.125,
      "completions/min_length": 7.9,
      "completions/min_terminated_length": 7.9,
      "epoch": 0.26543804153486455,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.005673303734511137,
      "kl": 1.341913378238678,
      "learning_rate": 1.9237288135593222e-06,
      "loss": 0.0013,
      "num_tokens": 4865945.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3860
    },
    {
      "completion_length": 11.6,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.9,
      "completions/max_terminated_length": 16.9,
      "completions/mean_length": 11.6,
      "completions/mean_terminated_length": 11.6,
      "completions/min_length": 8.6,
      "completions/min_terminated_length": 8.6,
      "epoch": 0.26612570485490306,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002212796825915575,
      "kl": 1.410479474067688,
      "learning_rate": 1.9143126177024486e-06,
      "loss": 0.0014,
      "num_tokens": 4878421.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3870
    },
    {
      "completion_length": 10.625,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.5,
      "completions/max_terminated_length": 13.5,
      "completions/mean_length": 10.625,
      "completions/mean_terminated_length": 10.625,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.26681336817494156,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0033141798339784145,
      "kl": 1.7209568858146667,
      "learning_rate": 1.9048964218455746e-06,
      "loss": 0.0017,
      "num_tokens": 4892258.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3880
    },
    {
      "completion_length": 11.275,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.3,
      "completions/max_terminated_length": 16.3,
      "completions/mean_length": 11.275,
      "completions/mean_terminated_length": 11.275,
      "completions/min_length": 7.9,
      "completions/min_terminated_length": 7.9,
      "epoch": 0.26750103149498006,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007818554528057575,
      "kl": 1.5238799691200255,
      "learning_rate": 1.8954802259887005e-06,
      "loss": 0.0015,
      "num_tokens": 4904961.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3890
    },
    {
      "completion_length": 10.875,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.3,
      "completions/max_terminated_length": 14.3,
      "completions/mean_length": 10.875,
      "completions/mean_terminated_length": 10.875,
      "completions/min_length": 8.2,
      "completions/min_terminated_length": 8.2,
      "epoch": 0.26818869481501856,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00452169356867671,
      "kl": 1.5174754858016968,
      "learning_rate": 1.886064030131827e-06,
      "loss": 0.0015,
      "num_tokens": 4916848.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3900
    },
    {
      "completion_length": 11.3,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.8,
      "completions/max_terminated_length": 16.8,
      "completions/mean_length": 11.3,
      "completions/mean_terminated_length": 11.3,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.26887635813505706,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0015657467301934958,
      "kl": 1.440461552143097,
      "learning_rate": 1.876647834274953e-06,
      "loss": 0.0014,
      "num_tokens": 4929564.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3910
    },
    {
      "completion_length": 10.175,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.1,
      "completions/max_terminated_length": 13.1,
      "completions/mean_length": 10.175,
      "completions/mean_terminated_length": 10.175,
      "completions/min_length": 7.6,
      "completions/min_terminated_length": 7.6,
      "epoch": 0.26956402145509556,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002364358166232705,
      "kl": 1.3664440631866455,
      "learning_rate": 1.867231638418079e-06,
      "loss": 0.0014,
      "num_tokens": 4941655.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3920
    },
    {
      "completion_length": 10.65,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.7,
      "completions/max_terminated_length": 13.7,
      "completions/mean_length": 10.65,
      "completions/mean_terminated_length": 10.65,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.2702516847751341,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003972493577748537,
      "kl": 1.4133961200714111,
      "learning_rate": 1.8578154425612054e-06,
      "loss": 0.0014,
      "num_tokens": 4954969.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3930
    },
    {
      "completion_length": 17.25,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 40.2,
      "completions/max_terminated_length": 40.2,
      "completions/mean_length": 17.25,
      "completions/mean_terminated_length": 17.25,
      "completions/min_length": 7.7,
      "completions/min_terminated_length": 7.7,
      "epoch": 0.2709393480951726,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.008966933004558086,
      "kl": 1.941703236103058,
      "learning_rate": 1.8483992467043316e-06,
      "loss": 0.0019,
      "num_tokens": 4967071.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.05,
      "rewards/match_format_approximately/mean": 0.975,
      "rewards/match_format_approximately/std": 0.05,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3940
    },
    {
      "completion_length": 11.5,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.9,
      "completions/max_terminated_length": 16.9,
      "completions/mean_length": 11.5,
      "completions/mean_terminated_length": 11.5,
      "completions/min_length": 7.8,
      "completions/min_terminated_length": 7.8,
      "epoch": 0.2716270114152111,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.010819694958627224,
      "kl": 1.5476664900779724,
      "learning_rate": 1.8389830508474578e-06,
      "loss": 0.0015,
      "num_tokens": 4980575.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3950
    },
    {
      "completion_length": 10.725,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.1,
      "completions/max_terminated_length": 14.1,
      "completions/mean_length": 10.725,
      "completions/mean_terminated_length": 10.725,
      "completions/min_length": 7.9,
      "completions/min_terminated_length": 7.9,
      "epoch": 0.2723146747352496,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003213522955775261,
      "kl": 1.3780342757701873,
      "learning_rate": 1.829566854990584e-06,
      "loss": 0.0014,
      "num_tokens": 4993112.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3960
    },
    {
      "completion_length": 10.775,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.8,
      "completions/max_terminated_length": 13.8,
      "completions/mean_length": 10.775,
      "completions/mean_terminated_length": 10.775,
      "completions/min_length": 7.5,
      "completions/min_terminated_length": 7.5,
      "epoch": 0.2730023380552881,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0077873156405985355,
      "kl": 1.4218901097774506,
      "learning_rate": 1.82015065913371e-06,
      "loss": 0.0014,
      "num_tokens": 5004699.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3970
    },
    {
      "completion_length": 10.475,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.6,
      "completions/max_terminated_length": 14.6,
      "completions/mean_length": 10.475,
      "completions/mean_terminated_length": 10.475,
      "completions/min_length": 7.3,
      "completions/min_terminated_length": 7.3,
      "epoch": 0.2736900013753266,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00725904107093811,
      "kl": 1.928431499004364,
      "learning_rate": 1.8107344632768365e-06,
      "loss": 0.0019,
      "num_tokens": 5018150.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3980
    },
    {
      "completion_length": 9.925,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.6,
      "completions/max_terminated_length": 12.6,
      "completions/mean_length": 9.925,
      "completions/mean_terminated_length": 9.925,
      "completions/min_length": 7.6,
      "completions/min_terminated_length": 7.6,
      "epoch": 0.2743776646953652,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.01053251139819622,
      "kl": 1.701702892780304,
      "learning_rate": 1.8013182674199624e-06,
      "loss": 0.0017,
      "num_tokens": 5030671.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 3990
    },
    {
      "completion_length": 11.9,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.9,
      "completions/max_terminated_length": 16.9,
      "completions/mean_length": 11.9,
      "completions/mean_terminated_length": 11.9,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.2750653280154037,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00460396520793438,
      "kl": 1.322218155860901,
      "learning_rate": 1.7919020715630886e-06,
      "loss": 0.0013,
      "num_tokens": 5043143.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4000
    },
    {
      "completion_length": 19.275,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 48.9,
      "completions/max_terminated_length": 48.9,
      "completions/mean_length": 19.275,
      "completions/mean_terminated_length": 19.275,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.2757529913354422,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.04520512372255325,
      "kl": 1.6543179631233216,
      "learning_rate": 1.782485875706215e-06,
      "loss": 0.0017,
      "num_tokens": 5056954.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.05,
      "rewards/match_format_approximately/mean": 0.975,
      "rewards/match_format_approximately/std": 0.05,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4010
    },
    {
      "completion_length": 11.4,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.3,
      "completions/max_terminated_length": 15.3,
      "completions/mean_length": 11.4,
      "completions/mean_terminated_length": 11.4,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.2764406546554807,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.04019185155630112,
      "kl": 1.6368863046169282,
      "learning_rate": 1.773069679849341e-06,
      "loss": 0.0016,
      "num_tokens": 5067586.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4020
    },
    {
      "completion_length": 11.85,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 18.6,
      "completions/max_terminated_length": 18.6,
      "completions/mean_length": 11.85,
      "completions/mean_terminated_length": 11.85,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.2771283179755192,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.003386999014765024,
      "kl": 1.5015459895133971,
      "learning_rate": 1.763653483992467e-06,
      "loss": 0.0015,
      "num_tokens": 5079692.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.45,
      "rewards/check_coherence/std": 0.1,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4030
    },
    {
      "completion_length": 10.275,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.0,
      "completions/max_terminated_length": 14.0,
      "completions/mean_length": 10.275,
      "completions/mean_terminated_length": 10.275,
      "completions/min_length": 7.4,
      "completions/min_terminated_length": 7.4,
      "epoch": 0.2778159812955577,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005510615184903145,
      "kl": 1.6972654938697815,
      "learning_rate": 1.7542372881355935e-06,
      "loss": 0.0017,
      "num_tokens": 5091063.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4040
    },
    {
      "completion_length": 10.975,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.4,
      "completions/max_terminated_length": 15.4,
      "completions/mean_length": 10.975,
      "completions/mean_terminated_length": 10.975,
      "completions/min_length": 8.2,
      "completions/min_terminated_length": 8.2,
      "epoch": 0.2785036446155962,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004958089906722307,
      "kl": 1.593740427494049,
      "learning_rate": 1.7448210922787194e-06,
      "loss": 0.0016,
      "num_tokens": 5103514.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4050
    },
    {
      "completion_length": 9.2,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.9,
      "completions/max_terminated_length": 12.9,
      "completions/mean_length": 9.2,
      "completions/mean_terminated_length": 9.2,
      "completions/min_length": 6.9,
      "completions/min_terminated_length": 6.9,
      "epoch": 0.27919130793563474,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005375854205340147,
      "kl": 1.9296481251716613,
      "learning_rate": 1.7354048964218456e-06,
      "loss": 0.0019,
      "num_tokens": 5114922.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4060
    },
    {
      "completion_length": 10.025,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.5,
      "completions/max_terminated_length": 14.5,
      "completions/mean_length": 10.025,
      "completions/mean_terminated_length": 10.025,
      "completions/min_length": 7.2,
      "completions/min_terminated_length": 7.2,
      "epoch": 0.27987897125567324,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.027488671243190765,
      "kl": 1.7945331156253814,
      "learning_rate": 1.725988700564972e-06,
      "loss": 0.0018,
      "num_tokens": 5127735.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4070
    },
    {
      "completion_length": 11.3,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.8,
      "completions/max_terminated_length": 14.8,
      "completions/mean_length": 11.3,
      "completions/mean_terminated_length": 11.3,
      "completions/min_length": 8.6,
      "completions/min_terminated_length": 8.6,
      "epoch": 0.28056663457571174,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005040109623223543,
      "kl": 2.0170140087604524,
      "learning_rate": 1.7165725047080982e-06,
      "loss": 0.002,
      "num_tokens": 5140555.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4080
    },
    {
      "completion_length": 10.825,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.0,
      "completions/max_terminated_length": 15.0,
      "completions/mean_length": 10.825,
      "completions/mean_terminated_length": 10.825,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.28125429789575024,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008321479894220829,
      "kl": 1.7255867421627045,
      "learning_rate": 1.7071563088512241e-06,
      "loss": 0.0017,
      "num_tokens": 5152752.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4090
    },
    {
      "completion_length": 14.425,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 28.0,
      "completions/max_terminated_length": 28.0,
      "completions/mean_length": 14.425,
      "completions/mean_terminated_length": 14.425,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.28194196121578874,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 5.667783737182617,
      "kl": 1.611900508403778,
      "learning_rate": 1.6977401129943505e-06,
      "loss": 0.0016,
      "num_tokens": 5164429.0,
      "reward": 5.9625,
      "reward_std": 0.075,
      "rewards/check_coherence/mean": 1.4875,
      "rewards/check_coherence/std": 0.025,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4100
    },
    {
      "completion_length": 10.2,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.7,
      "completions/max_terminated_length": 12.7,
      "completions/mean_length": 10.2,
      "completions/mean_terminated_length": 10.2,
      "completions/min_length": 8.2,
      "completions/min_terminated_length": 8.2,
      "epoch": 0.28262962453582724,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034870856907218695,
      "kl": 1.443020796775818,
      "learning_rate": 1.6883239171374767e-06,
      "loss": 0.0014,
      "num_tokens": 5176221.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4110
    },
    {
      "completion_length": 10.35,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.6,
      "completions/max_terminated_length": 13.6,
      "completions/mean_length": 10.35,
      "completions/mean_terminated_length": 10.35,
      "completions/min_length": 7.9,
      "completions/min_terminated_length": 7.9,
      "epoch": 0.28331728785586574,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006099638994783163,
      "kl": 1.7443130373954774,
      "learning_rate": 1.6789077212806026e-06,
      "loss": 0.0017,
      "num_tokens": 5189343.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4120
    },
    {
      "completion_length": 9.775,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.2,
      "completions/max_terminated_length": 14.2,
      "completions/mean_length": 9.775,
      "completions/mean_terminated_length": 9.775,
      "completions/min_length": 7.2,
      "completions/min_terminated_length": 7.2,
      "epoch": 0.2840049511759043,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006439814809709787,
      "kl": 1.7517601370811462,
      "learning_rate": 1.669491525423729e-06,
      "loss": 0.0018,
      "num_tokens": 5200602.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4130
    },
    {
      "completion_length": 13.075,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 25.0,
      "completions/max_terminated_length": 25.0,
      "completions/mean_length": 13.075,
      "completions/mean_terminated_length": 13.075,
      "completions/min_length": 7.4,
      "completions/min_terminated_length": 7.4,
      "epoch": 0.2846926144959428,
      "frac_reward_zero_std": 0.7,
      "grad_norm": 0.003682193113490939,
      "kl": 1.5702327251434327,
      "learning_rate": 1.6600753295668552e-06,
      "loss": 0.0016,
      "num_tokens": 5212097.0,
      "reward": 5.925,
      "reward_std": 0.15,
      "rewards/check_coherence/mean": 1.45,
      "rewards/check_coherence/std": 0.1,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4140
    },
    {
      "completion_length": 17.75,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 40.6,
      "completions/max_terminated_length": 40.6,
      "completions/mean_length": 17.75,
      "completions/mean_terminated_length": 17.75,
      "completions/min_length": 7.9,
      "completions/min_terminated_length": 7.9,
      "epoch": 0.2853802778159813,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.007733450271189213,
      "kl": 1.5168497264385223,
      "learning_rate": 1.6506591337099813e-06,
      "loss": 0.0015,
      "num_tokens": 5223511.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4150
    },
    {
      "completion_length": 11.075,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.4,
      "completions/max_terminated_length": 15.4,
      "completions/mean_length": 11.075,
      "completions/mean_terminated_length": 11.075,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.2860679411360198,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005073005799204111,
      "kl": 1.4336557388305664,
      "learning_rate": 1.6412429378531075e-06,
      "loss": 0.0014,
      "num_tokens": 5235182.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4160
    },
    {
      "completion_length": 11.5,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 18.3,
      "completions/max_terminated_length": 18.3,
      "completions/mean_length": 11.5,
      "completions/mean_terminated_length": 11.5,
      "completions/min_length": 7.9,
      "completions/min_terminated_length": 7.9,
      "epoch": 0.2867556044560583,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0042126537300646305,
      "kl": 1.6670551657676698,
      "learning_rate": 1.6318267419962337e-06,
      "loss": 0.0017,
      "num_tokens": 5246558.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4170
    },
    {
      "completion_length": 10.825,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.9,
      "completions/max_terminated_length": 13.9,
      "completions/mean_length": 10.825,
      "completions/mean_terminated_length": 10.825,
      "completions/min_length": 7.9,
      "completions/min_terminated_length": 7.9,
      "epoch": 0.2874432677760968,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003699904540553689,
      "kl": 1.4559585928916932,
      "learning_rate": 1.6224105461393598e-06,
      "loss": 0.0015,
      "num_tokens": 5257303.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4180
    },
    {
      "completion_length": 11.25,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.1,
      "completions/max_terminated_length": 16.1,
      "completions/mean_length": 11.25,
      "completions/mean_terminated_length": 11.25,
      "completions/min_length": 7.7,
      "completions/min_terminated_length": 7.7,
      "epoch": 0.28813093109613536,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.010834389366209507,
      "kl": 1.4884744346141816,
      "learning_rate": 1.612994350282486e-06,
      "loss": 0.0015,
      "num_tokens": 5269937.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4190
    },
    {
      "completion_length": 12.875,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 19.8,
      "completions/max_terminated_length": 19.8,
      "completions/mean_length": 12.875,
      "completions/mean_terminated_length": 12.875,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.28881859441617386,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.006072077434509993,
      "kl": 1.4151581585407258,
      "learning_rate": 1.6035781544256122e-06,
      "loss": 0.0014,
      "num_tokens": 5284716.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4200
    },
    {
      "completion_length": 19.5,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 46.6,
      "completions/max_terminated_length": 46.6,
      "completions/mean_length": 19.5,
      "completions/mean_terminated_length": 19.5,
      "completions/min_length": 7.8,
      "completions/min_terminated_length": 7.8,
      "epoch": 0.28950625773621236,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.00288102007471025,
      "kl": 1.5231776535511017,
      "learning_rate": 1.5941619585687384e-06,
      "loss": 0.0015,
      "num_tokens": 5298764.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4210
    },
    {
      "completion_length": 10.2,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.3,
      "completions/max_terminated_length": 14.3,
      "completions/mean_length": 10.2,
      "completions/mean_terminated_length": 10.2,
      "completions/min_length": 7.6,
      "completions/min_terminated_length": 7.6,
      "epoch": 0.29019392105625086,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004693563561886549,
      "kl": 1.6336079835891724,
      "learning_rate": 1.5847457627118645e-06,
      "loss": 0.0016,
      "num_tokens": 5309980.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4220
    },
    {
      "completion_length": 11.675,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.9,
      "completions/max_terminated_length": 16.9,
      "completions/mean_length": 11.675,
      "completions/mean_terminated_length": 11.675,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.29088158437628936,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.02345423959195614,
      "kl": 1.402549785375595,
      "learning_rate": 1.5753295668549907e-06,
      "loss": 0.0014,
      "num_tokens": 5321771.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4230
    },
    {
      "completion_length": 11.95,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 19.0,
      "completions/max_terminated_length": 19.0,
      "completions/mean_length": 11.95,
      "completions/mean_terminated_length": 11.95,
      "completions/min_length": 7.8,
      "completions/min_terminated_length": 7.8,
      "epoch": 0.29156924769632786,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.005161237437278032,
      "kl": 1.5061214089393615,
      "learning_rate": 1.565913370998117e-06,
      "loss": 0.0015,
      "num_tokens": 5334045.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4240
    },
    {
      "completion_length": 10.7,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.8,
      "completions/max_terminated_length": 16.8,
      "completions/mean_length": 10.7,
      "completions/mean_terminated_length": 10.7,
      "completions/min_length": 6.8,
      "completions/min_terminated_length": 6.8,
      "epoch": 0.29225691101636636,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008897802792489529,
      "kl": 2.1344063758850096,
      "learning_rate": 1.556497175141243e-06,
      "loss": 0.0021,
      "num_tokens": 5345873.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4250
    },
    {
      "completion_length": 11.3,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.8,
      "completions/max_terminated_length": 15.8,
      "completions/mean_length": 11.3,
      "completions/mean_terminated_length": 11.3,
      "completions/min_length": 7.5,
      "completions/min_terminated_length": 7.5,
      "epoch": 0.2929445743364049,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.015760373324155807,
      "kl": 1.5703859090805055,
      "learning_rate": 1.5470809792843692e-06,
      "loss": 0.0016,
      "num_tokens": 5358625.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4260
    },
    {
      "completion_length": 10.95,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.4,
      "completions/max_terminated_length": 16.4,
      "completions/mean_length": 10.95,
      "completions/mean_terminated_length": 10.95,
      "completions/min_length": 7.5,
      "completions/min_terminated_length": 7.5,
      "epoch": 0.2936322376564434,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.04361697658896446,
      "kl": 1.5343574345111848,
      "learning_rate": 1.5376647834274956e-06,
      "loss": 0.0015,
      "num_tokens": 5370723.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4270
    },
    {
      "completion_length": 10.925,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.3,
      "completions/max_terminated_length": 15.3,
      "completions/mean_length": 10.925,
      "completions/mean_terminated_length": 10.925,
      "completions/min_length": 8.1,
      "completions/min_terminated_length": 8.1,
      "epoch": 0.2943199009764819,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00989339780062437,
      "kl": 1.8978851437568665,
      "learning_rate": 1.5282485875706215e-06,
      "loss": 0.0019,
      "num_tokens": 5383060.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4280
    },
    {
      "completion_length": 50.4,
      "completions/clipped_ratio": 0.025,
      "completions/max_length": 170.8,
      "completions/max_terminated_length": 14.2,
      "completions/mean_length": 50.4,
      "completions/mean_terminated_length": 11.15,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.2950075642965204,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0025704088620841503,
      "kl": 1.435192084312439,
      "learning_rate": 1.5188323917137477e-06,
      "loss": 0.0014,
      "num_tokens": 5397712.0,
      "reward": 5.9375,
      "reward_std": 0.125,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.05,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4290
    },
    {
      "completion_length": 11.05,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.1,
      "completions/max_terminated_length": 14.1,
      "completions/mean_length": 11.05,
      "completions/mean_terminated_length": 11.05,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.2956952276165589,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.016429847106337547,
      "kl": 2.077840727567673,
      "learning_rate": 1.509416195856874e-06,
      "loss": 0.0021,
      "num_tokens": 5408798.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4300
    },
    {
      "completion_length": 11.775,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.6,
      "completions/max_terminated_length": 16.6,
      "completions/mean_length": 11.775,
      "completions/mean_terminated_length": 11.775,
      "completions/min_length": 9.2,
      "completions/min_terminated_length": 9.2,
      "epoch": 0.2963828909365974,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027806151192635298,
      "kl": 1.5844416201114655,
      "learning_rate": 1.5e-06,
      "loss": 0.0016,
      "num_tokens": 5421021.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4310
    },
    {
      "completion_length": 20.375,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 51.5,
      "completions/max_terminated_length": 51.5,
      "completions/mean_length": 20.375,
      "completions/mean_terminated_length": 20.375,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.297070554256636,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.005918384529650211,
      "kl": 1.2520611345767976,
      "learning_rate": 1.4905838041431264e-06,
      "loss": 0.0013,
      "num_tokens": 5435752.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4320
    },
    {
      "completion_length": 19.6,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 48.8,
      "completions/max_terminated_length": 48.8,
      "completions/mean_length": 19.6,
      "completions/mean_terminated_length": 19.6,
      "completions/min_length": 8.6,
      "completions/min_terminated_length": 8.6,
      "epoch": 0.2977582175766745,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.004772708751261234,
      "kl": 1.4595943570137024,
      "learning_rate": 1.4811676082862526e-06,
      "loss": 0.0015,
      "num_tokens": 5449672.0,
      "reward": 5.925,
      "reward_std": 0.15,
      "rewards/check_coherence/mean": 1.45,
      "rewards/check_coherence/std": 0.1,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4330
    },
    {
      "completion_length": 12.225,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 19.3,
      "completions/max_terminated_length": 19.3,
      "completions/mean_length": 12.225,
      "completions/mean_terminated_length": 12.225,
      "completions/min_length": 7.7,
      "completions/min_terminated_length": 7.7,
      "epoch": 0.298445880896713,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.018670905381441116,
      "kl": 1.2710513174533844,
      "learning_rate": 1.4717514124293785e-06,
      "loss": 0.0013,
      "num_tokens": 5460621.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4340
    },
    {
      "completion_length": 13.4,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 20.6,
      "completions/max_terminated_length": 20.6,
      "completions/mean_length": 13.4,
      "completions/mean_terminated_length": 13.4,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.2991335442167515,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 2.123333692550659,
      "kl": 1.4008326411247254,
      "learning_rate": 1.462335216572505e-06,
      "loss": 0.0014,
      "num_tokens": 5472757.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4350
    },
    {
      "completion_length": 11.875,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.8,
      "completions/max_terminated_length": 16.8,
      "completions/mean_length": 11.875,
      "completions/mean_terminated_length": 11.875,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.29982120753679,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.12061981111764908,
      "kl": 1.3641066908836366,
      "learning_rate": 1.452919020715631e-06,
      "loss": 0.0014,
      "num_tokens": 5483908.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4360
    },
    {
      "completion_length": 12.925,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 21.7,
      "completions/max_terminated_length": 21.7,
      "completions/mean_length": 12.925,
      "completions/mean_terminated_length": 12.925,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.3005088708568285,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004400401841849089,
      "kl": 1.4162483930587768,
      "learning_rate": 1.443502824858757e-06,
      "loss": 0.0014,
      "num_tokens": 5495553.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4370
    },
    {
      "completion_length": 12.775,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 19.7,
      "completions/max_terminated_length": 19.7,
      "completions/mean_length": 12.775,
      "completions/mean_terminated_length": 12.775,
      "completions/min_length": 7.7,
      "completions/min_terminated_length": 7.7,
      "epoch": 0.301196534176867,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.01428204495459795,
      "kl": 1.3453535318374634,
      "learning_rate": 1.4340866290018834e-06,
      "loss": 0.0013,
      "num_tokens": 5505996.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4380
    },
    {
      "completion_length": 12.525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 21.9,
      "completions/max_terminated_length": 21.9,
      "completions/mean_length": 12.525,
      "completions/mean_terminated_length": 12.525,
      "completions/min_length": 7.8,
      "completions/min_terminated_length": 7.8,
      "epoch": 0.30188419749690554,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 3.6220338344573975,
      "kl": 1.4408978760242461,
      "learning_rate": 1.4246704331450096e-06,
      "loss": 0.0014,
      "num_tokens": 5517849.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4390
    },
    {
      "completion_length": 12.7,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 20.1,
      "completions/max_terminated_length": 20.1,
      "completions/mean_length": 12.7,
      "completions/mean_terminated_length": 12.7,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.30257186081694404,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.009245552122592926,
      "kl": 1.8541507005691529,
      "learning_rate": 1.4152542372881356e-06,
      "loss": 0.0019,
      "num_tokens": 5531473.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4400
    },
    {
      "completion_length": 10.775,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.1,
      "completions/max_terminated_length": 15.1,
      "completions/mean_length": 10.775,
      "completions/mean_terminated_length": 10.775,
      "completions/min_length": 7.4,
      "completions/min_terminated_length": 7.4,
      "epoch": 0.30325952413698254,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0041836886666715145,
      "kl": 1.533377969264984,
      "learning_rate": 1.405838041431262e-06,
      "loss": 0.0015,
      "num_tokens": 5542716.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4410
    },
    {
      "completion_length": 12.05,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.3,
      "completions/max_terminated_length": 16.3,
      "completions/mean_length": 12.05,
      "completions/mean_terminated_length": 12.05,
      "completions/min_length": 9.4,
      "completions/min_terminated_length": 9.4,
      "epoch": 0.30394718745702104,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003924133721739054,
      "kl": 1.1724998950958252,
      "learning_rate": 1.396421845574388e-06,
      "loss": 0.0012,
      "num_tokens": 5556754.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4420
    },
    {
      "completion_length": 11.575,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.5,
      "completions/max_terminated_length": 16.5,
      "completions/mean_length": 11.575,
      "completions/mean_terminated_length": 11.575,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.30463485077705954,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006332984194159508,
      "kl": 1.4067680716514588,
      "learning_rate": 1.387005649717514e-06,
      "loss": 0.0014,
      "num_tokens": 5571041.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4430
    },
    {
      "completion_length": 13.4,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 21.0,
      "completions/max_terminated_length": 21.0,
      "completions/mean_length": 13.4,
      "completions/mean_terminated_length": 13.4,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.30532251409709804,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0028947910759598017,
      "kl": 1.2639997959136964,
      "learning_rate": 1.3775894538606404e-06,
      "loss": 0.0013,
      "num_tokens": 5583229.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4440
    },
    {
      "completion_length": 15.325,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 31.9,
      "completions/max_terminated_length": 31.9,
      "completions/mean_length": 15.325,
      "completions/mean_terminated_length": 15.325,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.3060101774171366,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.004690113477408886,
      "kl": 1.286442184448242,
      "learning_rate": 1.3681732580037666e-06,
      "loss": 0.0013,
      "num_tokens": 5593870.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4450
    },
    {
      "completion_length": 13.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 21.7,
      "completions/max_terminated_length": 21.7,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.3066978407371751,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.011687826365232468,
      "kl": 1.7261317849159241,
      "learning_rate": 1.358757062146893e-06,
      "loss": 0.0017,
      "num_tokens": 5608202.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4460
    },
    {
      "completion_length": 13.25,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 19.6,
      "completions/max_terminated_length": 19.6,
      "completions/mean_length": 13.25,
      "completions/mean_terminated_length": 13.25,
      "completions/min_length": 9.6,
      "completions/min_terminated_length": 9.6,
      "epoch": 0.3073855040572136,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00487670348957181,
      "kl": 1.3319970309734344,
      "learning_rate": 1.349340866290019e-06,
      "loss": 0.0013,
      "num_tokens": 5620112.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4470
    },
    {
      "completion_length": 9.975,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.8,
      "completions/max_terminated_length": 12.8,
      "completions/mean_length": 9.975,
      "completions/mean_terminated_length": 9.975,
      "completions/min_length": 7.7,
      "completions/min_terminated_length": 7.7,
      "epoch": 0.3080731673772521,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0082427728921175,
      "kl": 1.5232180416584016,
      "learning_rate": 1.3399246704331451e-06,
      "loss": 0.0015,
      "num_tokens": 5632167.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4480
    },
    {
      "completion_length": 13.2,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 20.6,
      "completions/max_terminated_length": 20.6,
      "completions/mean_length": 13.2,
      "completions/mean_terminated_length": 13.2,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.3087608306972906,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0026437605265527964,
      "kl": 1.2098506152629853,
      "learning_rate": 1.3305084745762715e-06,
      "loss": 0.0012,
      "num_tokens": 5644367.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4490
    },
    {
      "completion_length": 11.5,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.1,
      "completions/max_terminated_length": 16.1,
      "completions/mean_length": 11.5,
      "completions/mean_terminated_length": 11.5,
      "completions/min_length": 7.5,
      "completions/min_terminated_length": 7.5,
      "epoch": 0.3094484940173291,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.019921699538826942,
      "kl": 1.5676705598831178,
      "learning_rate": 1.3210922787193975e-06,
      "loss": 0.0016,
      "num_tokens": 5656659.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4500
    },
    {
      "completion_length": 11.675,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.5,
      "completions/max_terminated_length": 15.5,
      "completions/mean_length": 11.675,
      "completions/mean_terminated_length": 11.675,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.3101361573373676,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.019338268786668777,
      "kl": 2.288919413089752,
      "learning_rate": 1.3116760828625236e-06,
      "loss": 0.0023,
      "num_tokens": 5668666.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4510
    },
    {
      "completion_length": 10.1,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.1,
      "completions/max_terminated_length": 13.1,
      "completions/mean_length": 10.1,
      "completions/mean_terminated_length": 10.1,
      "completions/min_length": 7.7,
      "completions/min_terminated_length": 7.7,
      "epoch": 0.31082382065740616,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0050545958802104,
      "kl": 1.5871056616306305,
      "learning_rate": 1.30225988700565e-06,
      "loss": 0.0016,
      "num_tokens": 5683262.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4520
    },
    {
      "completion_length": 10.4,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.1,
      "completions/max_terminated_length": 13.1,
      "completions/mean_length": 10.4,
      "completions/mean_terminated_length": 10.4,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.31151148397744466,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.011630339547991753,
      "kl": 1.5014687776565552,
      "learning_rate": 1.292843691148776e-06,
      "loss": 0.0015,
      "num_tokens": 5695350.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4530
    },
    {
      "completion_length": 11.475,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.6,
      "completions/max_terminated_length": 15.6,
      "completions/mean_length": 11.475,
      "completions/mean_terminated_length": 11.475,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.31219914729748316,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004649888724088669,
      "kl": 1.3500673830509187,
      "learning_rate": 1.2834274952919021e-06,
      "loss": 0.0014,
      "num_tokens": 5706957.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4540
    },
    {
      "completion_length": 10.475,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.2,
      "completions/max_terminated_length": 13.2,
      "completions/mean_length": 10.475,
      "completions/mean_terminated_length": 10.475,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.31288681061752166,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001390202553011477,
      "kl": 1.491336989402771,
      "learning_rate": 1.2740112994350285e-06,
      "loss": 0.0015,
      "num_tokens": 5718220.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4550
    },
    {
      "completion_length": 12.775,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 19.7,
      "completions/max_terminated_length": 19.7,
      "completions/mean_length": 12.775,
      "completions/mean_terminated_length": 12.775,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.31357447393756016,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004471197724342346,
      "kl": 1.2512777149677277,
      "learning_rate": 1.2645951035781545e-06,
      "loss": 0.0013,
      "num_tokens": 5730955.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4560
    },
    {
      "completion_length": 12.2,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.3,
      "completions/max_terminated_length": 17.3,
      "completions/mean_length": 12.2,
      "completions/mean_terminated_length": 12.2,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.31426213725759866,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.021901508793234825,
      "kl": 1.457699954509735,
      "learning_rate": 1.2551789077212806e-06,
      "loss": 0.0015,
      "num_tokens": 5743123.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4570
    },
    {
      "completion_length": 11.3,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.2,
      "completions/max_terminated_length": 16.2,
      "completions/mean_length": 11.3,
      "completions/mean_terminated_length": 11.3,
      "completions/min_length": 8.1,
      "completions/min_terminated_length": 8.1,
      "epoch": 0.31494980057763716,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005899836774915457,
      "kl": 1.377420848608017,
      "learning_rate": 1.245762711864407e-06,
      "loss": 0.0014,
      "num_tokens": 5755139.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4580
    },
    {
      "completion_length": 26.875,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 75.9,
      "completions/max_terminated_length": 75.9,
      "completions/mean_length": 26.875,
      "completions/mean_terminated_length": 26.875,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.3156374638976757,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.006933213677257299,
      "kl": 1.3808945536613464,
      "learning_rate": 1.236346516007533e-06,
      "loss": 0.0014,
      "num_tokens": 5766890.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4590
    },
    {
      "completion_length": 10.975,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.4,
      "completions/max_terminated_length": 13.4,
      "completions/mean_length": 10.975,
      "completions/mean_terminated_length": 10.975,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.3163251272177142,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008102341555058956,
      "kl": 1.4617763698101043,
      "learning_rate": 1.2269303201506591e-06,
      "loss": 0.0015,
      "num_tokens": 5777961.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4600
    },
    {
      "completion_length": 19.825,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 45.8,
      "completions/max_terminated_length": 45.8,
      "completions/mean_length": 19.825,
      "completions/mean_terminated_length": 19.825,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.3170127905377527,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.007490513846278191,
      "kl": 1.2782706379890443,
      "learning_rate": 1.2175141242937855e-06,
      "loss": 0.0013,
      "num_tokens": 5790118.0,
      "reward": 5.9375,
      "reward_std": 0.125,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4625,
      "rewards/check_response_quality/std": 0.075,
      "rewards/match_format_approximately/mean": 0.975,
      "rewards/match_format_approximately/std": 0.05,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4610
    },
    {
      "completion_length": 13.075,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 19.5,
      "completions/max_terminated_length": 19.5,
      "completions/mean_length": 13.075,
      "completions/mean_terminated_length": 13.075,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.3177004538577912,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004942750558257103,
      "kl": 1.2929556488990783,
      "learning_rate": 1.2080979284369115e-06,
      "loss": 0.0013,
      "num_tokens": 5801985.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4620
    },
    {
      "completion_length": 18.35,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 42.6,
      "completions/max_terminated_length": 42.6,
      "completions/mean_length": 18.35,
      "completions/mean_terminated_length": 18.35,
      "completions/min_length": 7.9,
      "completions/min_terminated_length": 7.9,
      "epoch": 0.3183881171778297,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 1.5946123600006104,
      "kl": 1.4938054442405702,
      "learning_rate": 1.1986817325800379e-06,
      "loss": 0.0015,
      "num_tokens": 5815619.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4630
    },
    {
      "completion_length": 11.825,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.9,
      "completions/max_terminated_length": 16.9,
      "completions/mean_length": 11.825,
      "completions/mean_terminated_length": 11.825,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.3190757804978682,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.00890259351581335,
      "kl": 1.2599179446697235,
      "learning_rate": 1.189265536723164e-06,
      "loss": 0.0013,
      "num_tokens": 5826956.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4640
    },
    {
      "completion_length": 11.6,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.9,
      "completions/max_terminated_length": 15.9,
      "completions/mean_length": 11.6,
      "completions/mean_terminated_length": 11.6,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.3197634438179068,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.011387999169528484,
      "kl": 2.2636526942253115,
      "learning_rate": 1.17984934086629e-06,
      "loss": 0.0023,
      "num_tokens": 5839092.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4650
    },
    {
      "completion_length": 12.825,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 18.6,
      "completions/max_terminated_length": 18.6,
      "completions/mean_length": 12.825,
      "completions/mean_terminated_length": 12.825,
      "completions/min_length": 9.4,
      "completions/min_terminated_length": 9.4,
      "epoch": 0.3204511071379453,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004085191525518894,
      "kl": 1.345226788520813,
      "learning_rate": 1.1704331450094164e-06,
      "loss": 0.0013,
      "num_tokens": 5852409.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4660
    },
    {
      "completion_length": 11.25,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.8,
      "completions/max_terminated_length": 16.8,
      "completions/mean_length": 11.25,
      "completions/mean_terminated_length": 11.25,
      "completions/min_length": 7.9,
      "completions/min_terminated_length": 7.9,
      "epoch": 0.3211387704579838,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0036149087827652693,
      "kl": 1.3768569946289062,
      "learning_rate": 1.1610169491525425e-06,
      "loss": 0.0014,
      "num_tokens": 5866463.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4670
    },
    {
      "completion_length": 21.925,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 57.8,
      "completions/max_terminated_length": 57.8,
      "completions/mean_length": 21.925,
      "completions/mean_terminated_length": 21.925,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.3218264337780223,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.05678132548928261,
      "kl": 1.2918590784072876,
      "learning_rate": 1.1516007532956687e-06,
      "loss": 0.0013,
      "num_tokens": 5878300.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4680
    },
    {
      "completion_length": 11.1,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.1,
      "completions/max_terminated_length": 14.1,
      "completions/mean_length": 11.1,
      "completions/mean_terminated_length": 11.1,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.3225140970980608,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.005522090010344982,
      "kl": 1.47247673869133,
      "learning_rate": 1.1421845574387949e-06,
      "loss": 0.0015,
      "num_tokens": 5890220.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4690
    },
    {
      "completion_length": 11.775,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.8,
      "completions/max_terminated_length": 15.8,
      "completions/mean_length": 11.775,
      "completions/mean_terminated_length": 11.775,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.3232017604180993,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025551936123520136,
      "kl": 1.3015658378601074,
      "learning_rate": 1.132768361581921e-06,
      "loss": 0.0013,
      "num_tokens": 5903907.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4700
    },
    {
      "completion_length": 10.85,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.0,
      "completions/max_terminated_length": 14.0,
      "completions/mean_length": 10.85,
      "completions/mean_terminated_length": 10.85,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.3238894237381378,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00985870510339737,
      "kl": 1.5268153309822083,
      "learning_rate": 1.1233521657250472e-06,
      "loss": 0.0015,
      "num_tokens": 5914369.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4710
    },
    {
      "completion_length": 13.325,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 20.7,
      "completions/max_terminated_length": 20.7,
      "completions/mean_length": 13.325,
      "completions/mean_terminated_length": 13.325,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.32457708705817634,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0380832813680172,
      "kl": 1.440810000896454,
      "learning_rate": 1.1139359698681734e-06,
      "loss": 0.0014,
      "num_tokens": 5926126.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4720
    },
    {
      "completion_length": 10.3,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.1,
      "completions/max_terminated_length": 13.1,
      "completions/mean_length": 10.3,
      "completions/mean_terminated_length": 10.3,
      "completions/min_length": 8.2,
      "completions/min_terminated_length": 8.2,
      "epoch": 0.32526475037821484,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00719305407255888,
      "kl": 1.5471572160720826,
      "learning_rate": 1.1045197740112995e-06,
      "loss": 0.0015,
      "num_tokens": 5939830.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4730
    },
    {
      "completion_length": 11.45,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.1,
      "completions/max_terminated_length": 16.1,
      "completions/mean_length": 11.45,
      "completions/mean_terminated_length": 11.45,
      "completions/min_length": 8.1,
      "completions/min_terminated_length": 8.1,
      "epoch": 0.32595241369825334,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0119554428383708,
      "kl": 1.56171954870224,
      "learning_rate": 1.0951035781544257e-06,
      "loss": 0.0016,
      "num_tokens": 5953144.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4740
    },
    {
      "completion_length": 10.775,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.7,
      "completions/max_terminated_length": 14.7,
      "completions/mean_length": 10.775,
      "completions/mean_terminated_length": 10.775,
      "completions/min_length": 7.8,
      "completions/min_terminated_length": 7.8,
      "epoch": 0.32664007701829184,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0054775746539235115,
      "kl": 1.531346207857132,
      "learning_rate": 1.0856873822975519e-06,
      "loss": 0.0015,
      "num_tokens": 5965163.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4750
    },
    {
      "completion_length": 11.55,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.4,
      "completions/max_terminated_length": 15.4,
      "completions/mean_length": 11.55,
      "completions/mean_terminated_length": 11.55,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.32732774033833034,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.020217234268784523,
      "kl": 1.3433250963687897,
      "learning_rate": 1.076271186440678e-06,
      "loss": 0.0013,
      "num_tokens": 5977441.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4760
    },
    {
      "completion_length": 26.275,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 71.5,
      "completions/max_terminated_length": 71.5,
      "completions/mean_length": 26.275,
      "completions/mean_terminated_length": 26.275,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.32801540365836884,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.012751337140798569,
      "kl": 1.5057712554931642,
      "learning_rate": 1.0668549905838042e-06,
      "loss": 0.0015,
      "num_tokens": 5989348.0,
      "reward": 5.9,
      "reward_std": 0.14574271440505981,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4625,
      "rewards/check_response_quality/std": 0.053867512941360475,
      "rewards/match_format_approximately/mean": 0.9625,
      "rewards/match_format_approximately/std": 0.053867512941360475,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4770
    },
    {
      "completion_length": 16.825,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 36.5,
      "completions/max_terminated_length": 36.5,
      "completions/mean_length": 16.825,
      "completions/mean_terminated_length": 16.825,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.3287030669784074,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.008169161155819893,
      "kl": 1.307063925266266,
      "learning_rate": 1.0574387947269304e-06,
      "loss": 0.0013,
      "num_tokens": 6001573.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4780
    },
    {
      "completion_length": 9.875,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.9,
      "completions/max_terminated_length": 12.9,
      "completions/mean_length": 9.875,
      "completions/mean_terminated_length": 9.875,
      "completions/min_length": 7.5,
      "completions/min_terminated_length": 7.5,
      "epoch": 0.3293907302984459,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.022803228348493576,
      "kl": 1.3791126608848572,
      "learning_rate": 1.0480225988700566e-06,
      "loss": 0.0014,
      "num_tokens": 6013696.0,
      "reward": 5.9875,
      "reward_std": 0.025,
      "rewards/check_coherence/mean": 1.4875,
      "rewards/check_coherence/std": 0.025,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4790
    },
    {
      "completion_length": 11.45,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.0,
      "completions/max_terminated_length": 17.0,
      "completions/mean_length": 11.45,
      "completions/mean_terminated_length": 11.45,
      "completions/min_length": 8.1,
      "completions/min_terminated_length": 8.1,
      "epoch": 0.3300783936184844,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.01593186892569065,
      "kl": 1.3368689715862274,
      "learning_rate": 1.0386064030131827e-06,
      "loss": 0.0013,
      "num_tokens": 6026230.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4800
    },
    {
      "completion_length": 19.125,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 48.1,
      "completions/max_terminated_length": 48.1,
      "completions/mean_length": 19.125,
      "completions/mean_terminated_length": 19.125,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.3307660569385229,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.002272400539368391,
      "kl": 2.082442098855972,
      "learning_rate": 1.029190207156309e-06,
      "loss": 0.0021,
      "num_tokens": 6038639.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4810
    },
    {
      "completion_length": 11.2,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.6,
      "completions/max_terminated_length": 15.6,
      "completions/mean_length": 11.2,
      "completions/mean_terminated_length": 11.2,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.3314537202585614,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004652642644941807,
      "kl": 1.4440054893493652,
      "learning_rate": 1.0197740112994353e-06,
      "loss": 0.0014,
      "num_tokens": 6050235.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4820
    },
    {
      "completion_length": 11.075,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.3,
      "completions/max_terminated_length": 15.3,
      "completions/mean_length": 11.075,
      "completions/mean_terminated_length": 11.075,
      "completions/min_length": 8.1,
      "completions/min_terminated_length": 8.1,
      "epoch": 0.3321413835785999,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00583495432510972,
      "kl": 1.6347231984138488,
      "learning_rate": 1.0103578154425612e-06,
      "loss": 0.0016,
      "num_tokens": 6061518.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4830
    },
    {
      "completion_length": 10.85,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.3,
      "completions/max_terminated_length": 13.3,
      "completions/mean_length": 10.85,
      "completions/mean_terminated_length": 10.85,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.3328290468986384,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003644515760242939,
      "kl": 1.3461124837398528,
      "learning_rate": 1.0009416195856874e-06,
      "loss": 0.0013,
      "num_tokens": 6073212.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4840
    },
    {
      "completion_length": 11.425,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.1,
      "completions/max_terminated_length": 15.1,
      "completions/mean_length": 11.425,
      "completions/mean_terminated_length": 11.425,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.33351671021867696,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006971648428589106,
      "kl": 1.4714319944381713,
      "learning_rate": 9.915254237288138e-07,
      "loss": 0.0015,
      "num_tokens": 6085061.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4850
    },
    {
      "completion_length": 32.725,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 100.4,
      "completions/max_terminated_length": 100.4,
      "completions/mean_length": 32.725,
      "completions/mean_terminated_length": 32.725,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.33420437353871546,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.010788694955408573,
      "kl": 1.3329748511314392,
      "learning_rate": 9.821092278719397e-07,
      "loss": 0.0013,
      "num_tokens": 6099130.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4860
    },
    {
      "completion_length": 10.875,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.7,
      "completions/max_terminated_length": 13.7,
      "completions/mean_length": 10.875,
      "completions/mean_terminated_length": 10.875,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.33489203685875396,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.002744455123320222,
      "kl": 2.1530386984348295,
      "learning_rate": 9.72693032015066e-07,
      "loss": 0.0022,
      "num_tokens": 6111409.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4870
    },
    {
      "completion_length": 12.9,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.4,
      "completions/max_terminated_length": 17.4,
      "completions/mean_length": 12.9,
      "completions/mean_terminated_length": 12.9,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.33557970017879246,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.004031331278383732,
      "kl": 1.3381536304950714,
      "learning_rate": 9.632768361581923e-07,
      "loss": 0.0013,
      "num_tokens": 6123997.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4880
    },
    {
      "completion_length": 10.7,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.3,
      "completions/max_terminated_length": 14.3,
      "completions/mean_length": 10.7,
      "completions/mean_terminated_length": 10.7,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.33626736349883096,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.03266019746661186,
      "kl": 1.422250461578369,
      "learning_rate": 9.538606403013182e-07,
      "loss": 0.0014,
      "num_tokens": 6137029.0,
      "reward": 5.9875,
      "reward_std": 0.025,
      "rewards/check_coherence/mean": 1.4875,
      "rewards/check_coherence/std": 0.025,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4890
    },
    {
      "completion_length": 26.85,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 77.0,
      "completions/max_terminated_length": 77.0,
      "completions/mean_length": 26.85,
      "completions/mean_terminated_length": 26.85,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.33695502681886946,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.004609998781234026,
      "kl": 1.1960768818855285,
      "learning_rate": 9.444444444444445e-07,
      "loss": 0.0012,
      "num_tokens": 6151579.0,
      "reward": 5.9625,
      "reward_std": 0.075,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.05,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4900
    },
    {
      "completion_length": 12.125,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.0,
      "completions/max_terminated_length": 17.0,
      "completions/mean_length": 12.125,
      "completions/mean_terminated_length": 12.125,
      "completions/min_length": 8.6,
      "completions/min_terminated_length": 8.6,
      "epoch": 0.33764269013890796,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007132112048566341,
      "kl": 1.325413703918457,
      "learning_rate": 9.350282485875707e-07,
      "loss": 0.0013,
      "num_tokens": 6164076.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4910
    },
    {
      "completion_length": 11.825,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.1,
      "completions/max_terminated_length": 15.1,
      "completions/mean_length": 11.825,
      "completions/mean_terminated_length": 11.825,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.3383303534589465,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005040779244154692,
      "kl": 1.277837687730789,
      "learning_rate": 9.25612052730697e-07,
      "loss": 0.0013,
      "num_tokens": 6175141.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4920
    },
    {
      "completion_length": 11.825,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.3,
      "completions/max_terminated_length": 17.3,
      "completions/mean_length": 11.825,
      "completions/mean_terminated_length": 11.825,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.339018016778985,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004855050239712,
      "kl": 1.2279843807220459,
      "learning_rate": 9.16195856873823e-07,
      "loss": 0.0012,
      "num_tokens": 6187590.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4930
    },
    {
      "completion_length": 11.85,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.7,
      "completions/max_terminated_length": 16.7,
      "completions/mean_length": 11.85,
      "completions/mean_terminated_length": 11.85,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.3397056800990235,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004113995004445314,
      "kl": 1.2801987171173095,
      "learning_rate": 9.067796610169492e-07,
      "loss": 0.0013,
      "num_tokens": 6199832.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4940
    },
    {
      "completion_length": 10.65,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.6,
      "completions/max_terminated_length": 13.6,
      "completions/mean_length": 10.65,
      "completions/mean_terminated_length": 10.65,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.340393343419062,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.04914846643805504,
      "kl": 1.4876422524452209,
      "learning_rate": 8.973634651600755e-07,
      "loss": 0.0015,
      "num_tokens": 6213162.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4950
    },
    {
      "completion_length": 15.8,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 32.0,
      "completions/max_terminated_length": 32.0,
      "completions/mean_length": 15.8,
      "completions/mean_terminated_length": 15.8,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.3410810067391005,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.011781705543398857,
      "kl": 1.1131984174251557,
      "learning_rate": 8.879472693032015e-07,
      "loss": 0.0011,
      "num_tokens": 6223426.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4960
    },
    {
      "completion_length": 11.375,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.8,
      "completions/max_terminated_length": 16.8,
      "completions/mean_length": 11.375,
      "completions/mean_terminated_length": 11.375,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.341768670059139,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.008672394789755344,
      "kl": 1.2104420125484467,
      "learning_rate": 8.785310734463277e-07,
      "loss": 0.0012,
      "num_tokens": 6235529.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4970
    },
    {
      "completion_length": 10.95,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.3,
      "completions/max_terminated_length": 16.3,
      "completions/mean_length": 10.95,
      "completions/mean_terminated_length": 10.95,
      "completions/min_length": 7.7,
      "completions/min_terminated_length": 7.7,
      "epoch": 0.3424563333791776,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006266096141189337,
      "kl": 1.310611402988434,
      "learning_rate": 8.69114877589454e-07,
      "loss": 0.0013,
      "num_tokens": 6245659.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4980
    },
    {
      "completion_length": 11.475,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.7,
      "completions/max_terminated_length": 14.7,
      "completions/mean_length": 11.475,
      "completions/mean_terminated_length": 11.475,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.3431439966992161,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.012767767533659935,
      "kl": 1.3206363081932069,
      "learning_rate": 8.596986817325801e-07,
      "loss": 0.0013,
      "num_tokens": 6257054.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 4990
    },
    {
      "completion_length": 12.575,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 18.7,
      "completions/max_terminated_length": 18.7,
      "completions/mean_length": 12.575,
      "completions/mean_terminated_length": 12.575,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.3438316600192546,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.006154041737318039,
      "kl": 1.1578357517719269,
      "learning_rate": 8.502824858757062e-07,
      "loss": 0.0012,
      "num_tokens": 6270365.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5000
    },
    {
      "completion_length": 12.45,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.0,
      "completions/max_terminated_length": 17.0,
      "completions/mean_length": 12.45,
      "completions/mean_terminated_length": 12.45,
      "completions/min_length": 9.2,
      "completions/min_terminated_length": 9.2,
      "epoch": 0.3445193233392931,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0072873965837061405,
      "kl": 1.4725398778915406,
      "learning_rate": 8.408662900188325e-07,
      "loss": 0.0015,
      "num_tokens": 6281839.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5010
    },
    {
      "completion_length": 21.55,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 53.4,
      "completions/max_terminated_length": 53.4,
      "completions/mean_length": 21.55,
      "completions/mean_terminated_length": 21.55,
      "completions/min_length": 8.6,
      "completions/min_terminated_length": 8.6,
      "epoch": 0.3452069866593316,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.009922947734594345,
      "kl": 1.309710693359375,
      "learning_rate": 8.314500941619586e-07,
      "loss": 0.0013,
      "num_tokens": 6295025.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5020
    },
    {
      "completion_length": 19.225,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 42.8,
      "completions/max_terminated_length": 42.8,
      "completions/mean_length": 19.225,
      "completions/mean_terminated_length": 19.225,
      "completions/min_length": 9.2,
      "completions/min_terminated_length": 9.2,
      "epoch": 0.3458946499793701,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.00749584287405014,
      "kl": 1.1847262263298035,
      "learning_rate": 8.220338983050847e-07,
      "loss": 0.0012,
      "num_tokens": 6307442.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5030
    },
    {
      "completion_length": 12.65,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.9,
      "completions/max_terminated_length": 15.9,
      "completions/mean_length": 12.65,
      "completions/mean_terminated_length": 12.65,
      "completions/min_length": 9.6,
      "completions/min_terminated_length": 9.6,
      "epoch": 0.3465823132994086,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.004699906334280968,
      "kl": 1.1936538338661193,
      "learning_rate": 8.12617702448211e-07,
      "loss": 0.0012,
      "num_tokens": 6320480.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5040
    },
    {
      "completion_length": 27.425,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 75.3,
      "completions/max_terminated_length": 75.3,
      "completions/mean_length": 27.425,
      "completions/mean_terminated_length": 27.425,
      "completions/min_length": 9.4,
      "completions/min_terminated_length": 9.4,
      "epoch": 0.34726997661944714,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.004290629643946886,
      "kl": 1.093562251329422,
      "learning_rate": 8.032015065913372e-07,
      "loss": 0.0011,
      "num_tokens": 6332481.0,
      "reward": 5.925,
      "reward_std": 0.15,
      "rewards/check_coherence/mean": 1.45,
      "rewards/check_coherence/std": 0.1,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5050
    },
    {
      "completion_length": 11.525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.1,
      "completions/max_terminated_length": 16.1,
      "completions/mean_length": 11.525,
      "completions/mean_terminated_length": 11.525,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.34795763993948564,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0031757554970681667,
      "kl": 1.2823837637901305,
      "learning_rate": 7.937853107344634e-07,
      "loss": 0.0013,
      "num_tokens": 6343942.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5060
    },
    {
      "completion_length": 11.25,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.7,
      "completions/max_terminated_length": 13.7,
      "completions/mean_length": 11.25,
      "completions/mean_terminated_length": 11.25,
      "completions/min_length": 9.4,
      "completions/min_terminated_length": 9.4,
      "epoch": 0.34864530325952414,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005417166743427515,
      "kl": 1.0797785520553589,
      "learning_rate": 7.843691148775895e-07,
      "loss": 0.0011,
      "num_tokens": 6356732.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5070
    },
    {
      "completion_length": 18.375,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 41.7,
      "completions/max_terminated_length": 41.7,
      "completions/mean_length": 18.375,
      "completions/mean_terminated_length": 18.375,
      "completions/min_length": 9.4,
      "completions/min_terminated_length": 9.4,
      "epoch": 0.34933296657956264,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.005895074922591448,
      "kl": 1.1690425515174865,
      "learning_rate": 7.749529190207157e-07,
      "loss": 0.0012,
      "num_tokens": 6366627.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5080
    },
    {
      "completion_length": 11.925,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.5,
      "completions/max_terminated_length": 15.5,
      "completions/mean_length": 11.925,
      "completions/mean_terminated_length": 11.925,
      "completions/min_length": 9.4,
      "completions/min_terminated_length": 9.4,
      "epoch": 0.35002062989960114,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005460694897919893,
      "kl": 1.1566421210765838,
      "learning_rate": 7.655367231638419e-07,
      "loss": 0.0012,
      "num_tokens": 6378400.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5090
    },
    {
      "completion_length": 20.2,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 49.7,
      "completions/max_terminated_length": 49.7,
      "completions/mean_length": 20.2,
      "completions/mean_terminated_length": 20.2,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.35070829321963964,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.02746504172682762,
      "kl": 1.2156450688838958,
      "learning_rate": 7.56120527306968e-07,
      "loss": 0.0012,
      "num_tokens": 6389916.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5100
    },
    {
      "completion_length": 28.125,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 78.9,
      "completions/max_terminated_length": 78.9,
      "completions/mean_length": 28.125,
      "completions/mean_terminated_length": 28.125,
      "completions/min_length": 9.6,
      "completions/min_terminated_length": 9.6,
      "epoch": 0.3513959565396782,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.002399625489488244,
      "kl": 1.255337220430374,
      "learning_rate": 7.467043314500942e-07,
      "loss": 0.0013,
      "num_tokens": 6403225.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5110
    },
    {
      "completion_length": 11.9,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.4,
      "completions/max_terminated_length": 17.4,
      "completions/mean_length": 11.9,
      "completions/mean_terminated_length": 11.9,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.3520836198597167,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.008576685562729836,
      "kl": 1.2706177592277528,
      "learning_rate": 7.372881355932204e-07,
      "loss": 0.0013,
      "num_tokens": 6413773.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5120
    },
    {
      "completion_length": 12.125,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.6,
      "completions/max_terminated_length": 16.6,
      "completions/mean_length": 12.125,
      "completions/mean_terminated_length": 12.125,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.3527712831797552,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004991778638213873,
      "kl": 1.2184751868247985,
      "learning_rate": 7.278719397363465e-07,
      "loss": 0.0012,
      "num_tokens": 6426346.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5130
    },
    {
      "completion_length": 12.025,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.2,
      "completions/max_terminated_length": 15.2,
      "completions/mean_length": 12.025,
      "completions/mean_terminated_length": 12.025,
      "completions/min_length": 9.7,
      "completions/min_terminated_length": 9.7,
      "epoch": 0.3534589464997937,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005585167091339827,
      "kl": 1.2197207391262055,
      "learning_rate": 7.184557438794728e-07,
      "loss": 0.0012,
      "num_tokens": 6439211.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5140
    },
    {
      "completion_length": 40.525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 127.6,
      "completions/max_terminated_length": 127.6,
      "completions/mean_length": 40.525,
      "completions/mean_terminated_length": 40.525,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.3541466098198322,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.007972132414579391,
      "kl": 1.2606733858585357,
      "learning_rate": 7.09039548022599e-07,
      "loss": 0.0013,
      "num_tokens": 6453252.0,
      "reward": 5.925,
      "reward_std": 0.15,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.05,
      "rewards/match_format_approximately/mean": 0.975,
      "rewards/match_format_approximately/std": 0.05,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5150
    },
    {
      "completion_length": 11.075,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.0,
      "completions/max_terminated_length": 15.0,
      "completions/mean_length": 11.075,
      "completions/mean_terminated_length": 11.075,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.3548342731398707,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009181777015328407,
      "kl": 1.3541767716407775,
      "learning_rate": 6.996233521657251e-07,
      "loss": 0.0014,
      "num_tokens": 6464707.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5160
    },
    {
      "completion_length": 21.575,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 53.3,
      "completions/max_terminated_length": 53.3,
      "completions/mean_length": 21.575,
      "completions/mean_terminated_length": 21.575,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.3555219364599092,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.005340268835425377,
      "kl": 1.1702833890914917,
      "learning_rate": 6.902071563088513e-07,
      "loss": 0.0012,
      "num_tokens": 6476706.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5170
    },
    {
      "completion_length": 11.4,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.8,
      "completions/max_terminated_length": 14.8,
      "completions/mean_length": 11.4,
      "completions/mean_terminated_length": 11.4,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.35620959977994776,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00588555634021759,
      "kl": 1.3390805840492248,
      "learning_rate": 6.807909604519775e-07,
      "loss": 0.0013,
      "num_tokens": 6488342.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5180
    },
    {
      "completion_length": 11.225,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.3,
      "completions/max_terminated_length": 14.3,
      "completions/mean_length": 11.225,
      "completions/mean_terminated_length": 11.225,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.35689726309998626,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007043247576802969,
      "kl": 1.256647562980652,
      "learning_rate": 6.713747645951036e-07,
      "loss": 0.0013,
      "num_tokens": 6500363.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5190
    },
    {
      "completion_length": 11.3,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.3,
      "completions/max_terminated_length": 13.3,
      "completions/mean_length": 11.3,
      "completions/mean_terminated_length": 11.3,
      "completions/min_length": 9.4,
      "completions/min_terminated_length": 9.4,
      "epoch": 0.35758492642002476,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005169827025383711,
      "kl": 1.0854641497135162,
      "learning_rate": 6.619585687382298e-07,
      "loss": 0.0011,
      "num_tokens": 6513667.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5200
    },
    {
      "completion_length": 13.6,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 24.6,
      "completions/max_terminated_length": 24.6,
      "completions/mean_length": 13.6,
      "completions/mean_terminated_length": 13.6,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.35827258974006326,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.02052394114434719,
      "kl": 1.5007223725318908,
      "learning_rate": 6.52542372881356e-07,
      "loss": 0.0015,
      "num_tokens": 6526107.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5210
    },
    {
      "completion_length": 11.9,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.6,
      "completions/max_terminated_length": 15.6,
      "completions/mean_length": 11.9,
      "completions/mean_terminated_length": 11.9,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.35896025306010176,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.2504121959209442,
      "kl": 1.3355840384960174,
      "learning_rate": 6.431261770244822e-07,
      "loss": 0.0013,
      "num_tokens": 6538363.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5220
    },
    {
      "completion_length": 10.95,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.2,
      "completions/max_terminated_length": 14.2,
      "completions/mean_length": 10.95,
      "completions/mean_terminated_length": 10.95,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.35964791638014026,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006526256911456585,
      "kl": 1.2021831452846528,
      "learning_rate": 6.337099811676084e-07,
      "loss": 0.0012,
      "num_tokens": 6551289.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5230
    },
    {
      "completion_length": 29.4,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 82.8,
      "completions/max_terminated_length": 82.8,
      "completions/mean_length": 29.4,
      "completions/mean_terminated_length": 29.4,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.36033557970017877,
      "frac_reward_zero_std": 0.7,
      "grad_norm": 0.00306605058722198,
      "kl": 1.1230719089508057,
      "learning_rate": 6.242937853107346e-07,
      "loss": 0.0011,
      "num_tokens": 6563953.0,
      "reward": 5.9,
      "reward_std": 0.2,
      "rewards/check_coherence/mean": 1.45,
      "rewards/check_coherence/std": 0.1,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.05,
      "rewards/match_format_approximately/mean": 0.975,
      "rewards/match_format_approximately/std": 0.05,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5240
    },
    {
      "completion_length": 12.65,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.2,
      "completions/max_terminated_length": 17.2,
      "completions/mean_length": 12.65,
      "completions/mean_terminated_length": 12.65,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.3610232430202173,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005555626004934311,
      "kl": 1.14589102268219,
      "learning_rate": 6.148775894538607e-07,
      "loss": 0.0011,
      "num_tokens": 6576639.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5250
    },
    {
      "completion_length": 12.325,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.1,
      "completions/max_terminated_length": 16.1,
      "completions/mean_length": 12.325,
      "completions/mean_terminated_length": 12.325,
      "completions/min_length": 9.8,
      "completions/min_terminated_length": 9.8,
      "epoch": 0.3617109063402558,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006481971126049757,
      "kl": 1.2317140579223633,
      "learning_rate": 6.054613935969868e-07,
      "loss": 0.0012,
      "num_tokens": 6589136.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5260
    },
    {
      "completion_length": 63.85,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 219.6,
      "completions/max_terminated_length": 219.6,
      "completions/mean_length": 63.85,
      "completions/mean_terminated_length": 63.85,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.3623985696602943,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 1.8743151426315308,
      "kl": 1.0735913693904877,
      "learning_rate": 5.960451977401131e-07,
      "loss": 0.0011,
      "num_tokens": 6602826.0,
      "reward": 5.925,
      "reward_std": 0.15,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.05,
      "rewards/match_format_approximately/mean": 0.975,
      "rewards/match_format_approximately/std": 0.05,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5270
    },
    {
      "completion_length": 12.425,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.3,
      "completions/max_terminated_length": 16.3,
      "completions/mean_length": 12.425,
      "completions/mean_terminated_length": 12.425,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.3630862329803328,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00245496304705739,
      "kl": 1.2225584924221038,
      "learning_rate": 5.866290018832392e-07,
      "loss": 0.0012,
      "num_tokens": 6615731.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5280
    },
    {
      "completion_length": 14.025,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 26.3,
      "completions/max_terminated_length": 26.3,
      "completions/mean_length": 14.025,
      "completions/mean_terminated_length": 14.025,
      "completions/min_length": 7.7,
      "completions/min_terminated_length": 7.7,
      "epoch": 0.3637738963003713,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 22.05070686340332,
      "kl": 1.3892334163188935,
      "learning_rate": 5.772128060263654e-07,
      "loss": 0.0014,
      "num_tokens": 6627700.0,
      "reward": 5.9125,
      "reward_std": 0.175,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4625,
      "rewards/check_response_quality/std": 0.075,
      "rewards/match_format_approximately/mean": 0.975,
      "rewards/match_format_approximately/std": 0.05,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5290
    },
    {
      "completion_length": 12.275,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.4,
      "completions/max_terminated_length": 15.4,
      "completions/mean_length": 12.275,
      "completions/mean_terminated_length": 12.275,
      "completions/min_length": 10.2,
      "completions/min_terminated_length": 10.2,
      "epoch": 0.3644615596204098,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0061513264663517475,
      "kl": 1.1569065511226655,
      "learning_rate": 5.677966101694916e-07,
      "loss": 0.0012,
      "num_tokens": 6641163.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5300
    },
    {
      "completion_length": 12.275,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.6,
      "completions/max_terminated_length": 17.6,
      "completions/mean_length": 12.275,
      "completions/mean_terminated_length": 12.275,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.3651492229404484,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004866019356995821,
      "kl": 1.1956783890724183,
      "learning_rate": 5.583804143126178e-07,
      "loss": 0.0012,
      "num_tokens": 6653154.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5310
    },
    {
      "completion_length": 19.55,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 46.2,
      "completions/max_terminated_length": 46.2,
      "completions/mean_length": 19.55,
      "completions/mean_terminated_length": 19.55,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.3658368862604869,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0047823116183280945,
      "kl": 1.1544413030147553,
      "learning_rate": 5.489642184557439e-07,
      "loss": 0.0012,
      "num_tokens": 6665856.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5320
    },
    {
      "completion_length": 16.125,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 32.2,
      "completions/max_terminated_length": 32.2,
      "completions/mean_length": 16.125,
      "completions/mean_terminated_length": 16.125,
      "completions/min_length": 9.2,
      "completions/min_terminated_length": 9.2,
      "epoch": 0.3665245495805254,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.002742278855293989,
      "kl": 1.186799842119217,
      "learning_rate": 5.395480225988701e-07,
      "loss": 0.0012,
      "num_tokens": 6678137.0,
      "reward": 5.9625,
      "reward_std": 0.075,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.05,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5330
    },
    {
      "completion_length": 11.825,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.9,
      "completions/max_terminated_length": 15.9,
      "completions/mean_length": 11.825,
      "completions/mean_terminated_length": 11.825,
      "completions/min_length": 8.1,
      "completions/min_terminated_length": 8.1,
      "epoch": 0.3672122129005639,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008778350427746773,
      "kl": 1.0832504153251648,
      "learning_rate": 5.301318267419963e-07,
      "loss": 0.0011,
      "num_tokens": 6690458.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5340
    },
    {
      "completion_length": 15.9,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 31.8,
      "completions/max_terminated_length": 31.8,
      "completions/mean_length": 15.9,
      "completions/mean_terminated_length": 15.9,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.3678998762206024,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.004558028653264046,
      "kl": 1.1966825664043426,
      "learning_rate": 5.207156308851224e-07,
      "loss": 0.0012,
      "num_tokens": 6704106.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5350
    },
    {
      "completion_length": 12.55,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.9,
      "completions/max_terminated_length": 15.9,
      "completions/mean_length": 12.55,
      "completions/mean_terminated_length": 12.55,
      "completions/min_length": 9.9,
      "completions/min_terminated_length": 9.9,
      "epoch": 0.3685875395406409,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004179758485406637,
      "kl": 1.0407968640327454,
      "learning_rate": 5.112994350282487e-07,
      "loss": 0.001,
      "num_tokens": 6716772.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5360
    },
    {
      "completion_length": 16.3,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 35.9,
      "completions/max_terminated_length": 35.9,
      "completions/mean_length": 16.3,
      "completions/mean_terminated_length": 16.3,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.3692752028606794,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 3.5104458332061768,
      "kl": 1.2622505128383636,
      "learning_rate": 5.018832391713748e-07,
      "loss": 0.0013,
      "num_tokens": 6731212.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5370
    },
    {
      "completion_length": 13.9,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 24.5,
      "completions/max_terminated_length": 24.5,
      "completions/mean_length": 13.9,
      "completions/mean_terminated_length": 13.9,
      "completions/min_length": 8.1,
      "completions/min_terminated_length": 8.1,
      "epoch": 0.36996286618071794,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0022497333120554686,
      "kl": 1.1641826272010802,
      "learning_rate": 4.924670433145009e-07,
      "loss": 0.0012,
      "num_tokens": 6742524.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5380
    },
    {
      "completion_length": 12.4,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 18.7,
      "completions/max_terminated_length": 18.7,
      "completions/mean_length": 12.4,
      "completions/mean_terminated_length": 12.4,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.37065052950075644,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0036960735451430082,
      "kl": 1.3138091266155243,
      "learning_rate": 4.830508474576272e-07,
      "loss": 0.0013,
      "num_tokens": 6755684.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5390
    },
    {
      "completion_length": 12.8,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.0,
      "completions/max_terminated_length": 16.0,
      "completions/mean_length": 12.8,
      "completions/mean_terminated_length": 12.8,
      "completions/min_length": 10.1,
      "completions/min_terminated_length": 10.1,
      "epoch": 0.37133819282079494,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008950471878051758,
      "kl": 1.09179944396019,
      "learning_rate": 4.736346516007533e-07,
      "loss": 0.0011,
      "num_tokens": 6769472.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5400
    },
    {
      "completion_length": 11.425,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.8,
      "completions/max_terminated_length": 13.8,
      "completions/mean_length": 11.425,
      "completions/mean_terminated_length": 11.425,
      "completions/min_length": 9.2,
      "completions/min_terminated_length": 9.2,
      "epoch": 0.37202585614083344,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00552986329421401,
      "kl": 1.1067481756210327,
      "learning_rate": 4.6421845574387955e-07,
      "loss": 0.0011,
      "num_tokens": 6781257.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5410
    },
    {
      "completion_length": 25.55,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 68.4,
      "completions/max_terminated_length": 68.4,
      "completions/mean_length": 25.55,
      "completions/mean_terminated_length": 25.55,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.37271351946087194,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.005075867287814617,
      "kl": 1.245941936969757,
      "learning_rate": 4.5480225988700566e-07,
      "loss": 0.0012,
      "num_tokens": 6794011.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5420
    },
    {
      "completion_length": 12.2,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.7,
      "completions/max_terminated_length": 16.7,
      "completions/mean_length": 12.2,
      "completions/mean_terminated_length": 12.2,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.37340118278091045,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.015166271477937698,
      "kl": 1.3145410895347596,
      "learning_rate": 4.4538606403013183e-07,
      "loss": 0.0013,
      "num_tokens": 6804963.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5430
    },
    {
      "completion_length": 12.175,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.3,
      "completions/max_terminated_length": 15.3,
      "completions/mean_length": 12.175,
      "completions/mean_terminated_length": 12.175,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.374088846100949,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006140697747468948,
      "kl": 1.1052743911743164,
      "learning_rate": 4.3596986817325805e-07,
      "loss": 0.0011,
      "num_tokens": 6817578.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5440
    },
    {
      "completion_length": 11.975,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.6,
      "completions/max_terminated_length": 15.6,
      "completions/mean_length": 11.975,
      "completions/mean_terminated_length": 11.975,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.3747765094209875,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.011798047460615635,
      "kl": 1.282853376865387,
      "learning_rate": 4.265536723163842e-07,
      "loss": 0.0013,
      "num_tokens": 6827773.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5450
    },
    {
      "completion_length": 12.475,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.7,
      "completions/max_terminated_length": 16.7,
      "completions/mean_length": 12.475,
      "completions/mean_terminated_length": 12.475,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.375464172741026,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004385827574878931,
      "kl": 1.091407561302185,
      "learning_rate": 4.1713747645951044e-07,
      "loss": 0.0011,
      "num_tokens": 6840336.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5460
    },
    {
      "completion_length": 12.9,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 18.3,
      "completions/max_terminated_length": 18.3,
      "completions/mean_length": 12.9,
      "completions/mean_terminated_length": 12.9,
      "completions/min_length": 9.5,
      "completions/min_terminated_length": 9.5,
      "epoch": 0.3761518360610645,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004174704663455486,
      "kl": 1.1986376702785493,
      "learning_rate": 4.0772128060263656e-07,
      "loss": 0.0012,
      "num_tokens": 6851460.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5470
    },
    {
      "completion_length": 12.65,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 20.7,
      "completions/max_terminated_length": 20.7,
      "completions/mean_length": 12.65,
      "completions/mean_terminated_length": 12.65,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.376839499381103,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.00416609225794673,
      "kl": 1.216666615009308,
      "learning_rate": 3.983050847457627e-07,
      "loss": 0.0012,
      "num_tokens": 6862858.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5480
    },
    {
      "completion_length": 10.725,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.6,
      "completions/max_terminated_length": 13.6,
      "completions/mean_length": 10.725,
      "completions/mean_terminated_length": 10.725,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.3775271627011415,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008793276734650135,
      "kl": 1.2551342070102691,
      "learning_rate": 3.8888888888888895e-07,
      "loss": 0.0013,
      "num_tokens": 6873611.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5490
    },
    {
      "completion_length": 12.025,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.8,
      "completions/max_terminated_length": 15.8,
      "completions/mean_length": 12.025,
      "completions/mean_terminated_length": 12.025,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.37821482602118,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004354639444500208,
      "kl": 1.259047031402588,
      "learning_rate": 3.7947269303201506e-07,
      "loss": 0.0013,
      "num_tokens": 6885284.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5500
    },
    {
      "completion_length": 23.25,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 60.3,
      "completions/max_terminated_length": 60.3,
      "completions/mean_length": 23.25,
      "completions/mean_terminated_length": 23.25,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.37890248934121856,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.003756628604605794,
      "kl": 1.2296605169773103,
      "learning_rate": 3.700564971751413e-07,
      "loss": 0.0012,
      "num_tokens": 6898330.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5510
    },
    {
      "completion_length": 13.475,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 20.1,
      "completions/max_terminated_length": 20.1,
      "completions/mean_length": 13.475,
      "completions/mean_terminated_length": 13.475,
      "completions/min_length": 9.8,
      "completions/min_terminated_length": 9.8,
      "epoch": 0.37959015266125706,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.011304677464067936,
      "kl": 1.1572206735610961,
      "learning_rate": 3.6064030131826745e-07,
      "loss": 0.0012,
      "num_tokens": 6909365.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5520
    },
    {
      "completion_length": 12.85,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.8,
      "completions/max_terminated_length": 16.8,
      "completions/mean_length": 12.85,
      "completions/mean_terminated_length": 12.85,
      "completions/min_length": 9.6,
      "completions/min_terminated_length": 9.6,
      "epoch": 0.38027781598129556,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032178533729165792,
      "kl": 1.1290329098701477,
      "learning_rate": 3.512241054613937e-07,
      "loss": 0.0011,
      "num_tokens": 6921611.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5530
    },
    {
      "completion_length": 12.95,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 18.7,
      "completions/max_terminated_length": 18.7,
      "completions/mean_length": 12.95,
      "completions/mean_terminated_length": 12.95,
      "completions/min_length": 9.6,
      "completions/min_terminated_length": 9.6,
      "epoch": 0.38096547930133406,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00934622623026371,
      "kl": 1.1119795739650726,
      "learning_rate": 3.418079096045198e-07,
      "loss": 0.0011,
      "num_tokens": 6933669.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5540
    },
    {
      "completion_length": 12.4,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 18.7,
      "completions/max_terminated_length": 18.7,
      "completions/mean_length": 12.4,
      "completions/mean_terminated_length": 12.4,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.38165314262137257,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007171685341745615,
      "kl": 1.3122216761112213,
      "learning_rate": 3.3239171374764596e-07,
      "loss": 0.0013,
      "num_tokens": 6945577.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5550
    },
    {
      "completion_length": 42.975,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 135.7,
      "completions/max_terminated_length": 135.7,
      "completions/mean_length": 42.975,
      "completions/mean_terminated_length": 42.975,
      "completions/min_length": 9.5,
      "completions/min_terminated_length": 9.5,
      "epoch": 0.38234080594141107,
      "frac_reward_zero_std": 0.7,
      "grad_norm": 0.021711384877562523,
      "kl": 1.1385715395212173,
      "learning_rate": 3.229755178907722e-07,
      "loss": 0.0011,
      "num_tokens": 6958564.0,
      "reward": 5.9125,
      "reward_std": 0.175,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.45,
      "rewards/check_response_quality/std": 0.1,
      "rewards/match_format_approximately/mean": 0.9625,
      "rewards/match_format_approximately/std": 0.075,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5560
    },
    {
      "completion_length": 40.675,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 127.9,
      "completions/max_terminated_length": 127.9,
      "completions/mean_length": 40.675,
      "completions/mean_terminated_length": 40.675,
      "completions/min_length": 10.5,
      "completions/min_terminated_length": 10.5,
      "epoch": 0.38302846926144957,
      "frac_reward_zero_std": 0.7,
      "grad_norm": 0.0039871432818472385,
      "kl": 1.0618961095809936,
      "learning_rate": 3.135593220338983e-07,
      "loss": 0.0011,
      "num_tokens": 6971879.0,
      "reward": 5.9,
      "reward_std": 0.2,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4625,
      "rewards/check_response_quality/std": 0.075,
      "rewards/match_format_approximately/mean": 0.9625,
      "rewards/match_format_approximately/std": 0.075,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5570
    },
    {
      "completion_length": 12.3,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.8,
      "completions/max_terminated_length": 17.8,
      "completions/mean_length": 12.3,
      "completions/mean_terminated_length": 12.3,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.3837161325814881,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004354615230113268,
      "kl": 1.0903443574905396,
      "learning_rate": 3.041431261770245e-07,
      "loss": 0.0011,
      "num_tokens": 6982067.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5580
    },
    {
      "completion_length": 12.175,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.9,
      "completions/max_terminated_length": 16.9,
      "completions/mean_length": 12.175,
      "completions/mean_terminated_length": 12.175,
      "completions/min_length": 9.5,
      "completions/min_terminated_length": 9.5,
      "epoch": 0.3844037959015266,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006406493950635195,
      "kl": 1.113939094543457,
      "learning_rate": 2.947269303201507e-07,
      "loss": 0.0011,
      "num_tokens": 6994550.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5590
    },
    {
      "completion_length": 12.4,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.1,
      "completions/max_terminated_length": 17.1,
      "completions/mean_length": 12.4,
      "completions/mean_terminated_length": 12.4,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.3850914592215651,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0028774929232895374,
      "kl": 1.1482871413230895,
      "learning_rate": 2.8531073446327686e-07,
      "loss": 0.0011,
      "num_tokens": 7007206.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5600
    },
    {
      "completion_length": 25.925,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 68.3,
      "completions/max_terminated_length": 68.3,
      "completions/mean_length": 25.925,
      "completions/mean_terminated_length": 25.925,
      "completions/min_length": 9.9,
      "completions/min_terminated_length": 9.9,
      "epoch": 0.3857791225416036,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.004740417003631592,
      "kl": 1.0405526280403137,
      "learning_rate": 2.75894538606403e-07,
      "loss": 0.001,
      "num_tokens": 7019915.0,
      "reward": 5.925,
      "reward_std": 0.15,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.05,
      "rewards/match_format_approximately/mean": 0.975,
      "rewards/match_format_approximately/std": 0.05,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5610
    },
    {
      "completion_length": 17.9,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 39.0,
      "completions/max_terminated_length": 39.0,
      "completions/mean_length": 17.9,
      "completions/mean_terminated_length": 17.9,
      "completions/min_length": 9.2,
      "completions/min_terminated_length": 9.2,
      "epoch": 0.3864667858616421,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.014549007639288902,
      "kl": 1.0921140372753144,
      "learning_rate": 2.6647834274952925e-07,
      "loss": 0.0011,
      "num_tokens": 7034123.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5620
    },
    {
      "completion_length": 25.325,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 67.2,
      "completions/max_terminated_length": 67.2,
      "completions/mean_length": 25.325,
      "completions/mean_terminated_length": 25.325,
      "completions/min_length": 9.6,
      "completions/min_terminated_length": 9.6,
      "epoch": 0.3871544491816806,
      "frac_reward_zero_std": 0.7,
      "grad_norm": 0.002857876941561699,
      "kl": 1.0279980540275573,
      "learning_rate": 2.5706214689265536e-07,
      "loss": 0.001,
      "num_tokens": 7048884.0,
      "reward": 5.9,
      "reward_std": 0.2,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4625,
      "rewards/check_response_quality/std": 0.075,
      "rewards/match_format_approximately/mean": 0.9625,
      "rewards/match_format_approximately/std": 0.075,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5630
    },
    {
      "completion_length": 22.8,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 59.2,
      "completions/max_terminated_length": 59.2,
      "completions/mean_length": 22.8,
      "completions/mean_terminated_length": 22.8,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.3878421125017192,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.005176835227757692,
      "kl": 1.3074536800384522,
      "learning_rate": 2.4764595103578153e-07,
      "loss": 0.0013,
      "num_tokens": 7061932.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5640
    },
    {
      "completion_length": 11.6,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.5,
      "completions/max_terminated_length": 14.5,
      "completions/mean_length": 11.6,
      "completions/mean_terminated_length": 11.6,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.3885297758217577,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.012251128442585468,
      "kl": 1.2243261456489563,
      "learning_rate": 2.3822975517890773e-07,
      "loss": 0.0012,
      "num_tokens": 7074416.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5650
    },
    {
      "completion_length": 11.85,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.3,
      "completions/max_terminated_length": 14.3,
      "completions/mean_length": 11.85,
      "completions/mean_terminated_length": 11.85,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.3892174391417962,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006726069375872612,
      "kl": 2.5275397300720215,
      "learning_rate": 2.2881355932203392e-07,
      "loss": 0.0025,
      "num_tokens": 7087462.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5660
    },
    {
      "completion_length": 12.1,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.5,
      "completions/max_terminated_length": 15.5,
      "completions/mean_length": 12.1,
      "completions/mean_terminated_length": 12.1,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.3899051024618347,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004947757348418236,
      "kl": 1.1592617809772492,
      "learning_rate": 2.193973634651601e-07,
      "loss": 0.0012,
      "num_tokens": 7100302.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5670
    },
    {
      "completion_length": 14.325,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 22.6,
      "completions/max_terminated_length": 22.6,
      "completions/mean_length": 14.325,
      "completions/mean_terminated_length": 14.325,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.3905927657818732,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.03539891913533211,
      "kl": 1.1650193750858306,
      "learning_rate": 2.0998116760828628e-07,
      "loss": 0.0012,
      "num_tokens": 7112795.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5680
    },
    {
      "completion_length": 19.675,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 45.7,
      "completions/max_terminated_length": 45.7,
      "completions/mean_length": 19.675,
      "completions/mean_terminated_length": 19.675,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.3912804291019117,
      "frac_reward_zero_std": 0.7,
      "grad_norm": 1.8668094873428345,
      "kl": 1.1627625286579133,
      "learning_rate": 2.0056497175141243e-07,
      "loss": 0.0012,
      "num_tokens": 7125062.0,
      "reward": 5.925,
      "reward_std": 0.15,
      "rewards/check_coherence/mean": 1.45,
      "rewards/check_coherence/std": 0.1,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5690
    },
    {
      "completion_length": 11.5,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.4,
      "completions/max_terminated_length": 15.4,
      "completions/mean_length": 11.5,
      "completions/mean_terminated_length": 11.5,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.3919680924219502,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00394619582220912,
      "kl": 1.174496626853943,
      "learning_rate": 1.9114877589453862e-07,
      "loss": 0.0012,
      "num_tokens": 7136598.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5700
    },
    {
      "completion_length": 12.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.0,
      "completions/max_terminated_length": 14.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 9.7,
      "completions/min_terminated_length": 9.7,
      "epoch": 0.39265575574198874,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0035702527966350317,
      "kl": 1.0594860434532165,
      "learning_rate": 1.817325800376648e-07,
      "loss": 0.0011,
      "num_tokens": 7148570.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5710
    },
    {
      "completion_length": 11.025,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.8,
      "completions/max_terminated_length": 15.8,
      "completions/mean_length": 11.025,
      "completions/mean_terminated_length": 11.025,
      "completions/min_length": 7.8,
      "completions/min_terminated_length": 7.8,
      "epoch": 0.39334341906202724,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005145955365151167,
      "kl": 1.3463842570781708,
      "learning_rate": 1.7231638418079099e-07,
      "loss": 0.0013,
      "num_tokens": 7159799.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5720
    },
    {
      "completion_length": 13.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 18.1,
      "completions/max_terminated_length": 18.1,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 9.7,
      "completions/min_terminated_length": 9.7,
      "epoch": 0.39403108238206574,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.027580684050917625,
      "kl": 1.1848535418510437,
      "learning_rate": 1.6290018832391715e-07,
      "loss": 0.0012,
      "num_tokens": 7173311.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5730
    },
    {
      "completion_length": 13.3,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 21.0,
      "completions/max_terminated_length": 21.0,
      "completions/mean_length": 13.3,
      "completions/mean_terminated_length": 13.3,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.39471874570210425,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.00894416868686676,
      "kl": 1.2177072703838348,
      "learning_rate": 1.5348399246704332e-07,
      "loss": 0.0012,
      "num_tokens": 7185047.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5740
    },
    {
      "completion_length": 29.75,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 53.7,
      "completions/max_terminated_length": 53.7,
      "completions/mean_length": 29.75,
      "completions/mean_terminated_length": 29.75,
      "completions/min_length": 9.5,
      "completions/min_terminated_length": 9.5,
      "epoch": 0.39540640902214275,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.009776029735803604,
      "kl": 0.933041188120842,
      "learning_rate": 1.440677966101695e-07,
      "loss": 0.0009,
      "num_tokens": 7198301.0,
      "reward": 5.9,
      "reward_std": 0.1154700517654419,
      "rewards/check_coherence/mean": 1.45,
      "rewards/check_coherence/std": 0.05773502588272095,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.028867512941360474,
      "rewards/match_format_approximately/mean": 0.975,
      "rewards/match_format_approximately/std": 0.028867512941360474,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5750
    },
    {
      "completion_length": 29.575,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 83.1,
      "completions/max_terminated_length": 83.1,
      "completions/mean_length": 29.575,
      "completions/mean_terminated_length": 29.575,
      "completions/min_length": 9.8,
      "completions/min_terminated_length": 9.8,
      "epoch": 0.39609407234218125,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.004383026156574488,
      "kl": 1.0633900821208955,
      "learning_rate": 1.346516007532957e-07,
      "loss": 0.0011,
      "num_tokens": 7212852.0,
      "reward": 5.925,
      "reward_std": 0.09574271440505981,
      "rewards/check_coherence/mean": 1.45,
      "rewards/check_coherence/std": 0.05773502588272095,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5760
    },
    {
      "completion_length": 13.475,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 18.5,
      "completions/max_terminated_length": 18.5,
      "completions/mean_length": 13.475,
      "completions/mean_terminated_length": 13.475,
      "completions/min_length": 9.8,
      "completions/min_terminated_length": 9.8,
      "epoch": 0.3967817356622198,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.026582496240735054,
      "kl": 0.9946746349334716,
      "learning_rate": 1.2523540489642186e-07,
      "loss": 0.001,
      "num_tokens": 7225491.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5770
    },
    {
      "completion_length": 11.55,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.5,
      "completions/max_terminated_length": 14.5,
      "completions/mean_length": 11.55,
      "completions/mean_terminated_length": 11.55,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.3974693989822583,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006764892488718033,
      "kl": 1.2381967782974244,
      "learning_rate": 1.1581920903954804e-07,
      "loss": 0.0012,
      "num_tokens": 7238005.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5780
    },
    {
      "completion_length": 12.475,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.4,
      "completions/max_terminated_length": 17.4,
      "completions/mean_length": 12.475,
      "completions/mean_terminated_length": 12.475,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.3981570623022968,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009114019572734833,
      "kl": 1.2961783528327941,
      "learning_rate": 1.0640301318267422e-07,
      "loss": 0.0013,
      "num_tokens": 7249244.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5790
    },
    {
      "completion_length": 11.9,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.2,
      "completions/max_terminated_length": 17.2,
      "completions/mean_length": 11.9,
      "completions/mean_terminated_length": 11.9,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.3988447256223353,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004210586193948984,
      "kl": 1.234499990940094,
      "learning_rate": 9.698681732580038e-08,
      "loss": 0.0012,
      "num_tokens": 7261796.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5800
    },
    {
      "completion_length": 11.725,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.6,
      "completions/max_terminated_length": 16.6,
      "completions/mean_length": 11.725,
      "completions/mean_terminated_length": 11.725,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.3995323889423738,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005776832811534405,
      "kl": 1.5726662933826447,
      "learning_rate": 8.757062146892656e-08,
      "loss": 0.0016,
      "num_tokens": 7272721.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5810
    },
    {
      "completion_length": 11.35,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.6,
      "completions/max_terminated_length": 14.6,
      "completions/mean_length": 11.35,
      "completions/mean_terminated_length": 11.35,
      "completions/min_length": 8.6,
      "completions/min_terminated_length": 8.6,
      "epoch": 0.4002200522624123,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009506648406386375,
      "kl": 1.2478809118270875,
      "learning_rate": 7.815442561205274e-08,
      "loss": 0.0012,
      "num_tokens": 7282155.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5820
    },
    {
      "completion_length": 11.8,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.4,
      "completions/max_terminated_length": 14.4,
      "completions/mean_length": 11.8,
      "completions/mean_terminated_length": 11.8,
      "completions/min_length": 9.8,
      "completions/min_terminated_length": 9.8,
      "epoch": 0.4009077155824508,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004437907133251429,
      "kl": 1.7026907682418824,
      "learning_rate": 6.873822975517891e-08,
      "loss": 0.0017,
      "num_tokens": 7294895.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5830
    },
    {
      "completion_length": 11.875,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.0,
      "completions/max_terminated_length": 17.0,
      "completions/mean_length": 11.875,
      "completions/mean_terminated_length": 11.875,
      "completions/min_length": 8.2,
      "completions/min_terminated_length": 8.2,
      "epoch": 0.40159537890248936,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00963292270898819,
      "kl": 1.1885063588619231,
      "learning_rate": 5.932203389830509e-08,
      "loss": 0.0012,
      "num_tokens": 7304270.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5840
    },
    {
      "completion_length": 13.025,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.2,
      "completions/max_terminated_length": 17.2,
      "completions/mean_length": 13.025,
      "completions/mean_terminated_length": 13.025,
      "completions/min_length": 9.7,
      "completions/min_terminated_length": 9.7,
      "epoch": 0.40228304222252786,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034977158065885305,
      "kl": 1.1110076546669005,
      "learning_rate": 4.9905838041431265e-08,
      "loss": 0.0011,
      "num_tokens": 7315871.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5850
    },
    {
      "completion_length": 30.975,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 90.9,
      "completions/max_terminated_length": 90.9,
      "completions/mean_length": 30.975,
      "completions/mean_terminated_length": 30.975,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.40297070554256637,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.00785754807293415,
      "kl": 1.146417075395584,
      "learning_rate": 4.048964218455744e-08,
      "loss": 0.0011,
      "num_tokens": 7329542.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5860
    },
    {
      "completion_length": 56.375,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 189.6,
      "completions/max_terminated_length": 189.6,
      "completions/mean_length": 56.375,
      "completions/mean_terminated_length": 56.375,
      "completions/min_length": 10.2,
      "completions/min_terminated_length": 10.2,
      "epoch": 0.40365836886260487,
      "frac_reward_zero_std": 0.7,
      "grad_norm": 1.7976064682006836,
      "kl": 1.1094903528690339,
      "learning_rate": 3.107344632768362e-08,
      "loss": 0.0011,
      "num_tokens": 7343725.0,
      "reward": 5.925,
      "reward_std": 0.15,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4625,
      "rewards/check_response_quality/std": 0.075,
      "rewards/match_format_approximately/mean": 0.9625,
      "rewards/match_format_approximately/std": 0.075,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5870
    },
    {
      "completion_length": 12.7,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.2,
      "completions/max_terminated_length": 16.2,
      "completions/mean_length": 12.7,
      "completions/mean_terminated_length": 12.7,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.40434603218264337,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006050520576536655,
      "kl": 1.1388359487056732,
      "learning_rate": 2.1657250470809794e-08,
      "loss": 0.0011,
      "num_tokens": 7356937.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5880
    },
    {
      "completion_length": 16.025,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 32.0,
      "completions/max_terminated_length": 32.0,
      "completions/mean_length": 16.025,
      "completions/mean_terminated_length": 16.025,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.40503369550268187,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 4.21384334564209,
      "kl": 1.2012366831302643,
      "learning_rate": 1.2241054613935971e-08,
      "loss": 0.0012,
      "num_tokens": 7370634.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5890
    },
    {
      "completion_length": 12.95,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.7,
      "completions/max_terminated_length": 17.7,
      "completions/mean_length": 12.95,
      "completions/mean_terminated_length": 12.95,
      "completions/min_length": 9.8,
      "completions/min_terminated_length": 9.8,
      "epoch": 0.4057213588227204,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032482228707522154,
      "kl": 1.1373628199100494,
      "learning_rate": 2.8248587570621472e-09,
      "loss": 0.0011,
      "num_tokens": 7382464.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5900
    },
    {
      "completion_length": 15.8,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 31.8,
      "completions/max_terminated_length": 31.8,
      "completions/mean_length": 15.8,
      "completions/mean_terminated_length": 15.8,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.4064090221427589,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.007901502773165703,
      "kl": 1.0803104102611543,
      "learning_rate": 2.9302222222222227e-06,
      "loss": 0.0011,
      "num_tokens": 7394784.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5910
    },
    {
      "completion_length": 11.9,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.8,
      "completions/max_terminated_length": 14.8,
      "completions/mean_length": 11.9,
      "completions/mean_terminated_length": 11.9,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.4070966854627974,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0035292813554406166,
      "kl": 1.12642440199852,
      "learning_rate": 2.925777777777778e-06,
      "loss": 0.0011,
      "num_tokens": 7406184.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5920
    },
    {
      "completion_length": 12.65,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.3,
      "completions/max_terminated_length": 17.3,
      "completions/mean_length": 12.65,
      "completions/mean_terminated_length": 12.65,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.4077843487828359,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023179894778877497,
      "kl": 1.2278540432453156,
      "learning_rate": 2.9213333333333337e-06,
      "loss": 0.0012,
      "num_tokens": 7419590.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5930
    },
    {
      "completion_length": 19.65,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 49.9,
      "completions/max_terminated_length": 49.9,
      "completions/mean_length": 19.65,
      "completions/mean_terminated_length": 19.65,
      "completions/min_length": 8.2,
      "completions/min_terminated_length": 8.2,
      "epoch": 0.4084720121028744,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.005237962584942579,
      "kl": 1.34213387966156,
      "learning_rate": 2.916888888888889e-06,
      "loss": 0.0013,
      "num_tokens": 7430864.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5940
    },
    {
      "completion_length": 12.275,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.4,
      "completions/max_terminated_length": 16.4,
      "completions/mean_length": 12.275,
      "completions/mean_terminated_length": 12.275,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.4091596754229129,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.05530301854014397,
      "kl": 1.1015527904033662,
      "learning_rate": 2.9124444444444442e-06,
      "loss": 0.0011,
      "num_tokens": 7443395.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5950
    },
    {
      "completion_length": 13.675,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 19.6,
      "completions/max_terminated_length": 19.6,
      "completions/mean_length": 13.675,
      "completions/mean_terminated_length": 13.675,
      "completions/min_length": 9.2,
      "completions/min_terminated_length": 9.2,
      "epoch": 0.40984733874295143,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0057378592900931835,
      "kl": 1.3134727358818055,
      "learning_rate": 2.9080000000000004e-06,
      "loss": 0.0013,
      "num_tokens": 7457046.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5960
    },
    {
      "completion_length": 11.825,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.7,
      "completions/max_terminated_length": 13.7,
      "completions/mean_length": 11.825,
      "completions/mean_terminated_length": 11.825,
      "completions/min_length": 9.5,
      "completions/min_terminated_length": 9.5,
      "epoch": 0.41053500206299,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.09183824807405472,
      "kl": 1.3441233158111572,
      "learning_rate": 2.9035555555555556e-06,
      "loss": 0.0013,
      "num_tokens": 7468995.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5970
    },
    {
      "completion_length": 11.225,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.5,
      "completions/max_terminated_length": 15.5,
      "completions/mean_length": 11.225,
      "completions/mean_terminated_length": 11.225,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.4112226653830285,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00760465394705534,
      "kl": 1.5161000728607177,
      "learning_rate": 2.8991111111111113e-06,
      "loss": 0.0015,
      "num_tokens": 7481972.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5980
    },
    {
      "completion_length": 14.25,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 22.8,
      "completions/max_terminated_length": 22.8,
      "completions/mean_length": 14.25,
      "completions/mean_terminated_length": 14.25,
      "completions/min_length": 9.2,
      "completions/min_terminated_length": 9.2,
      "epoch": 0.411910328703067,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004714191425591707,
      "kl": 1.405648809671402,
      "learning_rate": 2.8946666666666666e-06,
      "loss": 0.0014,
      "num_tokens": 7494194.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 5990
    },
    {
      "completion_length": 17.175,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 39.2,
      "completions/max_terminated_length": 39.2,
      "completions/mean_length": 17.175,
      "completions/mean_terminated_length": 17.175,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.4125979920231055,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 2.1586594581604004,
      "kl": 1.4109269559383393,
      "learning_rate": 2.8902222222222227e-06,
      "loss": 0.0014,
      "num_tokens": 7505437.0,
      "reward": 5.925,
      "reward_std": 0.15,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.05,
      "rewards/match_format_approximately/mean": 0.975,
      "rewards/match_format_approximately/std": 0.05,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6000
    },
    {
      "completion_length": 24.45,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 68.7,
      "completions/max_terminated_length": 68.7,
      "completions/mean_length": 24.45,
      "completions/mean_terminated_length": 24.45,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.413285655343144,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.041129227727651596,
      "kl": 1.3110809564590453,
      "learning_rate": 2.885777777777778e-06,
      "loss": 0.0013,
      "num_tokens": 7518291.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6010
    },
    {
      "completion_length": 11.525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.5,
      "completions/max_terminated_length": 15.5,
      "completions/mean_length": 11.525,
      "completions/mean_terminated_length": 11.525,
      "completions/min_length": 8.1,
      "completions/min_terminated_length": 8.1,
      "epoch": 0.4139733186631825,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004482824355363846,
      "kl": 1.3215213119983673,
      "learning_rate": 2.8813333333333337e-06,
      "loss": 0.0013,
      "num_tokens": 7530908.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6020
    },
    {
      "completion_length": 10.8,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.4,
      "completions/max_terminated_length": 17.4,
      "completions/mean_length": 10.8,
      "completions/mean_terminated_length": 10.8,
      "completions/min_length": 7.1,
      "completions/min_terminated_length": 7.1,
      "epoch": 0.414660981983221,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.004378788638859987,
      "kl": 1.6575611710548401,
      "learning_rate": 2.876888888888889e-06,
      "loss": 0.0017,
      "num_tokens": 7542956.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6030
    },
    {
      "completion_length": 9.875,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.3,
      "completions/max_terminated_length": 12.3,
      "completions/mean_length": 9.875,
      "completions/mean_terminated_length": 9.875,
      "completions/min_length": 7.8,
      "completions/min_terminated_length": 7.8,
      "epoch": 0.41534864530325954,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004252531565725803,
      "kl": 1.6062816500663757,
      "learning_rate": 2.872444444444445e-06,
      "loss": 0.0016,
      "num_tokens": 7555015.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6040
    },
    {
      "completion_length": 9.15,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 11.3,
      "completions/max_terminated_length": 11.3,
      "completions/mean_length": 9.15,
      "completions/mean_terminated_length": 9.15,
      "completions/min_length": 7.1,
      "completions/min_terminated_length": 7.1,
      "epoch": 0.41603630862329805,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008693459443747997,
      "kl": 1.7664753794670105,
      "learning_rate": 2.8680000000000003e-06,
      "loss": 0.0018,
      "num_tokens": 7567833.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6050
    },
    {
      "completion_length": 22.175,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 61.1,
      "completions/max_terminated_length": 61.1,
      "completions/mean_length": 22.175,
      "completions/mean_terminated_length": 22.175,
      "completions/min_length": 8.2,
      "completions/min_terminated_length": 8.2,
      "epoch": 0.41672397194333655,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.025784730911254883,
      "kl": 1.5375929474830627,
      "learning_rate": 2.8635555555555556e-06,
      "loss": 0.0015,
      "num_tokens": 7580336.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6060
    },
    {
      "completion_length": 10.9,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.0,
      "completions/max_terminated_length": 14.0,
      "completions/mean_length": 10.9,
      "completions/mean_terminated_length": 10.9,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.41741163526337505,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.023190055042505264,
      "kl": 1.4921037673950195,
      "learning_rate": 2.8591111111111113e-06,
      "loss": 0.0015,
      "num_tokens": 7591768.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6070
    },
    {
      "completion_length": 9.75,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.8,
      "completions/max_terminated_length": 13.8,
      "completions/mean_length": 9.75,
      "completions/mean_terminated_length": 9.75,
      "completions/min_length": 7.4,
      "completions/min_terminated_length": 7.4,
      "epoch": 0.41809929858341355,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.01030003372579813,
      "kl": 1.9349348664283752,
      "learning_rate": 2.8546666666666666e-06,
      "loss": 0.0019,
      "num_tokens": 7603882.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6080
    },
    {
      "completion_length": 10.95,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.1,
      "completions/max_terminated_length": 15.1,
      "completions/mean_length": 10.95,
      "completions/mean_terminated_length": 10.95,
      "completions/min_length": 8.6,
      "completions/min_terminated_length": 8.6,
      "epoch": 0.41878696190345205,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006510532461106777,
      "kl": 1.429217952489853,
      "learning_rate": 2.8502222222222227e-06,
      "loss": 0.0014,
      "num_tokens": 7617252.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6090
    },
    {
      "completion_length": 10.875,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.9,
      "completions/max_terminated_length": 13.9,
      "completions/mean_length": 10.875,
      "completions/mean_terminated_length": 10.875,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.4194746252234906,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005413992330431938,
      "kl": 1.5536803781986237,
      "learning_rate": 2.845777777777778e-06,
      "loss": 0.0016,
      "num_tokens": 7629711.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6100
    },
    {
      "completion_length": 10.7,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.5,
      "completions/max_terminated_length": 14.5,
      "completions/mean_length": 10.7,
      "completions/mean_terminated_length": 10.7,
      "completions/min_length": 7.8,
      "completions/min_terminated_length": 7.8,
      "epoch": 0.4201622885435291,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004487996455281973,
      "kl": 1.3762087106704712,
      "learning_rate": 2.8413333333333336e-06,
      "loss": 0.0014,
      "num_tokens": 7642731.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6110
    },
    {
      "completion_length": 9.45,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.3,
      "completions/max_terminated_length": 12.3,
      "completions/mean_length": 9.45,
      "completions/mean_terminated_length": 9.45,
      "completions/min_length": 7.5,
      "completions/min_terminated_length": 7.5,
      "epoch": 0.4208499518635676,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.016771214082837105,
      "kl": 1.5483428835868835,
      "learning_rate": 2.836888888888889e-06,
      "loss": 0.0015,
      "num_tokens": 7655069.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6120
    },
    {
      "completion_length": 10.6,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.3,
      "completions/max_terminated_length": 16.3,
      "completions/mean_length": 10.6,
      "completions/mean_terminated_length": 10.6,
      "completions/min_length": 6.9,
      "completions/min_terminated_length": 6.9,
      "epoch": 0.4215376151836061,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007195493672043085,
      "kl": 1.6185973286628723,
      "learning_rate": 2.832444444444445e-06,
      "loss": 0.0016,
      "num_tokens": 7667193.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6130
    },
    {
      "completion_length": 10.5,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.6,
      "completions/max_terminated_length": 14.6,
      "completions/mean_length": 10.5,
      "completions/mean_terminated_length": 10.5,
      "completions/min_length": 7.6,
      "completions/min_terminated_length": 7.6,
      "epoch": 0.4222252785036446,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00659464905038476,
      "kl": 1.662790560722351,
      "learning_rate": 2.8280000000000003e-06,
      "loss": 0.0017,
      "num_tokens": 7679765.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6140
    },
    {
      "completion_length": 10.425,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.7,
      "completions/max_terminated_length": 12.7,
      "completions/mean_length": 10.425,
      "completions/mean_terminated_length": 10.425,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.4229129418236831,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.015074046328663826,
      "kl": 1.298115348815918,
      "learning_rate": 2.823555555555556e-06,
      "loss": 0.0013,
      "num_tokens": 7690646.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6150
    },
    {
      "completion_length": 9.8,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.9,
      "completions/max_terminated_length": 12.9,
      "completions/mean_length": 9.8,
      "completions/mean_terminated_length": 9.8,
      "completions/min_length": 6.9,
      "completions/min_terminated_length": 6.9,
      "epoch": 0.4236006051437216,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.03453448787331581,
      "kl": 1.4956003904342652,
      "learning_rate": 2.8191111111111112e-06,
      "loss": 0.0015,
      "num_tokens": 7701978.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6160
    },
    {
      "completion_length": 10.8,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.5,
      "completions/max_terminated_length": 14.5,
      "completions/mean_length": 10.8,
      "completions/mean_terminated_length": 10.8,
      "completions/min_length": 8.2,
      "completions/min_terminated_length": 8.2,
      "epoch": 0.42428826846376017,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.02794322744011879,
      "kl": 1.452487486600876,
      "learning_rate": 2.8146666666666665e-06,
      "loss": 0.0015,
      "num_tokens": 7715074.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6170
    },
    {
      "completion_length": 10.05,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.5,
      "completions/max_terminated_length": 12.5,
      "completions/mean_length": 10.05,
      "completions/mean_terminated_length": 10.05,
      "completions/min_length": 7.7,
      "completions/min_terminated_length": 7.7,
      "epoch": 0.42497593178379867,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.011381459422409534,
      "kl": 1.4053172051906586,
      "learning_rate": 2.8102222222222226e-06,
      "loss": 0.0014,
      "num_tokens": 7726628.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6180
    },
    {
      "completion_length": 10.925,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.0,
      "completions/max_terminated_length": 14.0,
      "completions/mean_length": 10.925,
      "completions/mean_terminated_length": 10.925,
      "completions/min_length": 8.1,
      "completions/min_terminated_length": 8.1,
      "epoch": 0.42566359510383717,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003446686314418912,
      "kl": 1.3329946517944335,
      "learning_rate": 2.805777777777778e-06,
      "loss": 0.0013,
      "num_tokens": 7739049.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6190
    },
    {
      "completion_length": 11.125,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.0,
      "completions/max_terminated_length": 15.0,
      "completions/mean_length": 11.125,
      "completions/mean_terminated_length": 11.125,
      "completions/min_length": 8.1,
      "completions/min_terminated_length": 8.1,
      "epoch": 0.42635125842387567,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.018087368458509445,
      "kl": 1.4069275319576264,
      "learning_rate": 2.8013333333333336e-06,
      "loss": 0.0014,
      "num_tokens": 7750026.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6200
    },
    {
      "completion_length": 9.95,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.4,
      "completions/max_terminated_length": 13.4,
      "completions/mean_length": 9.95,
      "completions/mean_terminated_length": 9.95,
      "completions/min_length": 7.1,
      "completions/min_terminated_length": 7.1,
      "epoch": 0.42703892174391417,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006026132497936487,
      "kl": 1.6816380977630616,
      "learning_rate": 2.796888888888889e-06,
      "loss": 0.0017,
      "num_tokens": 7761324.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6210
    },
    {
      "completion_length": 12.05,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.1,
      "completions/max_terminated_length": 16.1,
      "completions/mean_length": 12.05,
      "completions/mean_terminated_length": 12.05,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.42772658506395267,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004558009561151266,
      "kl": 1.3497009932994843,
      "learning_rate": 2.792444444444445e-06,
      "loss": 0.0013,
      "num_tokens": 7775018.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6220
    },
    {
      "completion_length": 11.725,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.3,
      "completions/max_terminated_length": 16.3,
      "completions/mean_length": 11.725,
      "completions/mean_terminated_length": 11.725,
      "completions/min_length": 7.9,
      "completions/min_terminated_length": 7.9,
      "epoch": 0.4284142483839912,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003908331040292978,
      "kl": 1.3773386299610137,
      "learning_rate": 2.7880000000000002e-06,
      "loss": 0.0014,
      "num_tokens": 7785839.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6230
    },
    {
      "completion_length": 11.925,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.4,
      "completions/max_terminated_length": 16.4,
      "completions/mean_length": 11.925,
      "completions/mean_terminated_length": 11.925,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.4291019117040297,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0031500409822911024,
      "kl": 1.2545689225196839,
      "learning_rate": 2.783555555555556e-06,
      "loss": 0.0013,
      "num_tokens": 7797700.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6240
    },
    {
      "completion_length": 11.35,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.7,
      "completions/max_terminated_length": 15.7,
      "completions/mean_length": 11.35,
      "completions/mean_terminated_length": 11.35,
      "completions/min_length": 7.7,
      "completions/min_terminated_length": 7.7,
      "epoch": 0.4297895750240682,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.004434330854564905,
      "kl": 1.3473502099514008,
      "learning_rate": 2.779111111111111e-06,
      "loss": 0.0013,
      "num_tokens": 7810890.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6250
    },
    {
      "completion_length": 12.175,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.9,
      "completions/max_terminated_length": 14.9,
      "completions/mean_length": 12.175,
      "completions/mean_terminated_length": 12.175,
      "completions/min_length": 9.9,
      "completions/min_terminated_length": 9.9,
      "epoch": 0.4304772383441067,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0038921276573091745,
      "kl": 1.2112794041633606,
      "learning_rate": 2.7746666666666665e-06,
      "loss": 0.0012,
      "num_tokens": 7821885.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6260
    },
    {
      "completion_length": 10.9,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.5,
      "completions/max_terminated_length": 13.5,
      "completions/mean_length": 10.9,
      "completions/mean_terminated_length": 10.9,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.43116490166414523,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.017070645466446877,
      "kl": 1.3707080781459808,
      "learning_rate": 2.7702222222222226e-06,
      "loss": 0.0014,
      "num_tokens": 7833037.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6270
    },
    {
      "completion_length": 11.8,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.6,
      "completions/max_terminated_length": 15.6,
      "completions/mean_length": 11.8,
      "completions/mean_terminated_length": 11.8,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.43185256498418373,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0044752685353159904,
      "kl": 1.2199202120304107,
      "learning_rate": 2.765777777777778e-06,
      "loss": 0.0012,
      "num_tokens": 7847013.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6280
    },
    {
      "completion_length": 11.25,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.2,
      "completions/max_terminated_length": 14.2,
      "completions/mean_length": 11.25,
      "completions/mean_terminated_length": 11.25,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.43254022830422223,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020100600086152554,
      "kl": 1.2238180756568908,
      "learning_rate": 2.7613333333333335e-06,
      "loss": 0.0012,
      "num_tokens": 7859531.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6290
    },
    {
      "completion_length": 34.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 68.6,
      "completions/max_terminated_length": 68.6,
      "completions/mean_length": 34.0,
      "completions/mean_terminated_length": 34.0,
      "completions/min_length": 9.5,
      "completions/min_terminated_length": 9.5,
      "epoch": 0.4332278916242608,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 2.2058627605438232,
      "kl": 1.1081747114658356,
      "learning_rate": 2.756888888888889e-06,
      "loss": 0.0011,
      "num_tokens": 7873755.0,
      "reward": 5.925,
      "reward_std": 0.09574271440505981,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.028867512941360474,
      "rewards/match_format_approximately/mean": 0.975,
      "rewards/match_format_approximately/std": 0.028867512941360474,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6300
    },
    {
      "completion_length": 11.55,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.4,
      "completions/max_terminated_length": 14.4,
      "completions/mean_length": 11.55,
      "completions/mean_terminated_length": 11.55,
      "completions/min_length": 9.5,
      "completions/min_terminated_length": 9.5,
      "epoch": 0.4339155549442993,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003843477461487055,
      "kl": 1.2722107827663423,
      "learning_rate": 2.752444444444445e-06,
      "loss": 0.0013,
      "num_tokens": 7884969.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6310
    },
    {
      "completion_length": 12.5,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.9,
      "completions/max_terminated_length": 14.9,
      "completions/mean_length": 12.5,
      "completions/mean_terminated_length": 12.5,
      "completions/min_length": 9.6,
      "completions/min_terminated_length": 9.6,
      "epoch": 0.4346032182643378,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005162264686077833,
      "kl": 1.1211694359779358,
      "learning_rate": 2.748e-06,
      "loss": 0.0011,
      "num_tokens": 7897721.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6320
    },
    {
      "completion_length": 11.825,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.0,
      "completions/max_terminated_length": 15.0,
      "completions/mean_length": 11.825,
      "completions/mean_terminated_length": 11.825,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.4352908815843763,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001960468478500843,
      "kl": 1.2898571014404296,
      "learning_rate": 2.743555555555556e-06,
      "loss": 0.0013,
      "num_tokens": 7909910.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6330
    },
    {
      "completion_length": 13.7,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 22.1,
      "completions/max_terminated_length": 22.1,
      "completions/mean_length": 13.7,
      "completions/mean_terminated_length": 13.7,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.4359785449044148,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.0032273821998387575,
      "kl": 1.246416300535202,
      "learning_rate": 2.739111111111111e-06,
      "loss": 0.0012,
      "num_tokens": 7922274.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6340
    },
    {
      "completion_length": 12.225,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.4,
      "completions/max_terminated_length": 16.4,
      "completions/mean_length": 12.225,
      "completions/mean_terminated_length": 12.225,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.4366662082244533,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00453966436907649,
      "kl": 1.3314736366271973,
      "learning_rate": 2.7346666666666673e-06,
      "loss": 0.0013,
      "num_tokens": 7933375.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6350
    },
    {
      "completion_length": 11.675,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.1,
      "completions/max_terminated_length": 16.1,
      "completions/mean_length": 11.675,
      "completions/mean_terminated_length": 11.675,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.4373538715444918,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003801640821620822,
      "kl": 1.244855809211731,
      "learning_rate": 2.7302222222222225e-06,
      "loss": 0.0012,
      "num_tokens": 7944086.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6360
    },
    {
      "completion_length": 16.425,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 30.4,
      "completions/max_terminated_length": 30.4,
      "completions/mean_length": 16.425,
      "completions/mean_terminated_length": 16.425,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.43804153486453035,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0055861701257526875,
      "kl": 1.2101807296276093,
      "learning_rate": 2.725777777777778e-06,
      "loss": 0.0012,
      "num_tokens": 7957839.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6370
    },
    {
      "completion_length": 12.2,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.8,
      "completions/max_terminated_length": 17.8,
      "completions/mean_length": 12.2,
      "completions/mean_terminated_length": 12.2,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.43872919818456885,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032540559768676758,
      "kl": 1.1290572345256806,
      "learning_rate": 2.7213333333333335e-06,
      "loss": 0.0011,
      "num_tokens": 7969519.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6380
    },
    {
      "completion_length": 11.625,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.9,
      "completions/max_terminated_length": 17.9,
      "completions/mean_length": 11.625,
      "completions/mean_terminated_length": 11.625,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.43941686150460735,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007687001954764128,
      "kl": 1.2707543969154358,
      "learning_rate": 2.7168888888888888e-06,
      "loss": 0.0013,
      "num_tokens": 7982380.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6390
    },
    {
      "completion_length": 11.275,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.9,
      "completions/max_terminated_length": 15.9,
      "completions/mean_length": 11.275,
      "completions/mean_terminated_length": 11.275,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.44010452482464585,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003866757033392787,
      "kl": 1.3291922807693481,
      "learning_rate": 2.712444444444445e-06,
      "loss": 0.0013,
      "num_tokens": 7994443.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6400
    },
    {
      "completion_length": 11.675,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.1,
      "completions/max_terminated_length": 16.1,
      "completions/mean_length": 11.675,
      "completions/mean_terminated_length": 11.675,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.44079218814468435,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025106698740273714,
      "kl": 1.2189786791801454,
      "learning_rate": 2.708e-06,
      "loss": 0.0012,
      "num_tokens": 8005686.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6410
    },
    {
      "completion_length": 10.975,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.3,
      "completions/max_terminated_length": 14.3,
      "completions/mean_length": 10.975,
      "completions/mean_terminated_length": 10.975,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.44147985146472285,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0029226518236100674,
      "kl": 1.3308802485466003,
      "learning_rate": 2.703555555555556e-06,
      "loss": 0.0013,
      "num_tokens": 8018109.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6420
    },
    {
      "completion_length": 12.175,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.6,
      "completions/max_terminated_length": 16.6,
      "completions/mean_length": 12.175,
      "completions/mean_terminated_length": 12.175,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.4421675147847614,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0057080830447375774,
      "kl": 1.3752940356731416,
      "learning_rate": 2.699111111111111e-06,
      "loss": 0.0014,
      "num_tokens": 8032504.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6430
    },
    {
      "completion_length": 13.825,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 21.7,
      "completions/max_terminated_length": 21.7,
      "completions/mean_length": 13.825,
      "completions/mean_terminated_length": 13.825,
      "completions/min_length": 9.4,
      "completions/min_terminated_length": 9.4,
      "epoch": 0.4428551781047999,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004137388430535793,
      "kl": 1.1381323873996734,
      "learning_rate": 2.6946666666666672e-06,
      "loss": 0.0011,
      "num_tokens": 8044841.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6440
    },
    {
      "completion_length": 12.025,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.9,
      "completions/max_terminated_length": 15.9,
      "completions/mean_length": 12.025,
      "completions/mean_terminated_length": 12.025,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.4435428414248384,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004201957024633884,
      "kl": 1.29626407623291,
      "learning_rate": 2.6902222222222225e-06,
      "loss": 0.0013,
      "num_tokens": 8056430.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6450
    },
    {
      "completion_length": 12.725,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.1,
      "completions/max_terminated_length": 17.1,
      "completions/mean_length": 12.725,
      "completions/mean_terminated_length": 12.725,
      "completions/min_length": 9.9,
      "completions/min_terminated_length": 9.9,
      "epoch": 0.4442305047448769,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004895097576081753,
      "kl": 1.070450747013092,
      "learning_rate": 2.685777777777778e-06,
      "loss": 0.0011,
      "num_tokens": 8068211.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6460
    },
    {
      "completion_length": 14.125,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 22.3,
      "completions/max_terminated_length": 22.3,
      "completions/mean_length": 14.125,
      "completions/mean_terminated_length": 14.125,
      "completions/min_length": 9.5,
      "completions/min_terminated_length": 9.5,
      "epoch": 0.4449181680649154,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.007197881117463112,
      "kl": 1.2501220405101776,
      "learning_rate": 2.6813333333333335e-06,
      "loss": 0.0013,
      "num_tokens": 8080248.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6470
    },
    {
      "completion_length": 12.9,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 19.0,
      "completions/max_terminated_length": 19.0,
      "completions/mean_length": 12.9,
      "completions/mean_terminated_length": 12.9,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.4456058313849539,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003812718205153942,
      "kl": 1.3214125633239746,
      "learning_rate": 2.6768888888888887e-06,
      "loss": 0.0013,
      "num_tokens": 8094288.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6480
    },
    {
      "completion_length": 11.75,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.5,
      "completions/max_terminated_length": 15.5,
      "completions/mean_length": 11.75,
      "completions/mean_terminated_length": 11.75,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.4462934947049924,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003874816931784153,
      "kl": 1.3365242898464202,
      "learning_rate": 2.672444444444445e-06,
      "loss": 0.0013,
      "num_tokens": 8108670.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6490
    },
    {
      "completion_length": 13.275,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 21.5,
      "completions/max_terminated_length": 21.5,
      "completions/mean_length": 13.275,
      "completions/mean_terminated_length": 13.275,
      "completions/min_length": 8.6,
      "completions/min_terminated_length": 8.6,
      "epoch": 0.44698115802503097,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.002750998130068183,
      "kl": 1.2364072024822235,
      "learning_rate": 2.668e-06,
      "loss": 0.0012,
      "num_tokens": 8123205.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6500
    },
    {
      "completion_length": 10.525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.0,
      "completions/max_terminated_length": 14.0,
      "completions/mean_length": 10.525,
      "completions/mean_terminated_length": 10.525,
      "completions/min_length": 7.6,
      "completions/min_terminated_length": 7.6,
      "epoch": 0.44766882134506947,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006352982483804226,
      "kl": 1.334709495306015,
      "learning_rate": 2.663555555555556e-06,
      "loss": 0.0013,
      "num_tokens": 8136234.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6510
    },
    {
      "completion_length": 11.175,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.7,
      "completions/max_terminated_length": 14.7,
      "completions/mean_length": 11.175,
      "completions/mean_terminated_length": 11.175,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.44835648466510797,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005306031089276075,
      "kl": 1.203692877292633,
      "learning_rate": 2.659111111111111e-06,
      "loss": 0.0012,
      "num_tokens": 8148841.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6520
    },
    {
      "completion_length": 10.475,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.4,
      "completions/max_terminated_length": 13.4,
      "completions/mean_length": 10.475,
      "completions/mean_terminated_length": 10.475,
      "completions/min_length": 8.2,
      "completions/min_terminated_length": 8.2,
      "epoch": 0.44904414798514647,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006456505972892046,
      "kl": 1.4366894364356995,
      "learning_rate": 2.654666666666667e-06,
      "loss": 0.0014,
      "num_tokens": 8161592.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6530
    },
    {
      "completion_length": 10.65,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.9,
      "completions/max_terminated_length": 12.9,
      "completions/mean_length": 10.65,
      "completions/mean_terminated_length": 10.65,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.44973181130518497,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00415951618924737,
      "kl": 1.2912186980247498,
      "learning_rate": 2.6502222222222225e-06,
      "loss": 0.0013,
      "num_tokens": 8174226.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6540
    },
    {
      "completion_length": 12.875,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 20.7,
      "completions/max_terminated_length": 20.7,
      "completions/mean_length": 12.875,
      "completions/mean_terminated_length": 12.875,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.45041947462522347,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.005188601557165384,
      "kl": 1.2176654636859894,
      "learning_rate": 2.645777777777778e-06,
      "loss": 0.0012,
      "num_tokens": 8187109.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6550
    },
    {
      "completion_length": 11.025,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.6,
      "completions/max_terminated_length": 13.6,
      "completions/mean_length": 11.025,
      "completions/mean_terminated_length": 11.025,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.451107137945262,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006083431653678417,
      "kl": 1.3448700129985809,
      "learning_rate": 2.6413333333333334e-06,
      "loss": 0.0013,
      "num_tokens": 8198434.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6560
    },
    {
      "completion_length": 11.725,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.0,
      "completions/max_terminated_length": 15.0,
      "completions/mean_length": 11.725,
      "completions/mean_terminated_length": 11.725,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.4517948012653005,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003008403116837144,
      "kl": 1.1978749930858612,
      "learning_rate": 2.6368888888888887e-06,
      "loss": 0.0012,
      "num_tokens": 8209723.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6570
    },
    {
      "completion_length": 11.425,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.5,
      "completions/max_terminated_length": 14.5,
      "completions/mean_length": 11.425,
      "completions/mean_terminated_length": 11.425,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.45248246458533903,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008322266861796379,
      "kl": 1.235140424966812,
      "learning_rate": 2.632444444444445e-06,
      "loss": 0.0012,
      "num_tokens": 8223700.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6580
    },
    {
      "completion_length": 11.225,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.2,
      "completions/max_terminated_length": 14.2,
      "completions/mean_length": 11.225,
      "completions/mean_terminated_length": 11.225,
      "completions/min_length": 8.2,
      "completions/min_terminated_length": 8.2,
      "epoch": 0.45317012790537753,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002982344478368759,
      "kl": 1.4228580594062805,
      "learning_rate": 2.628e-06,
      "loss": 0.0014,
      "num_tokens": 8234605.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6590
    },
    {
      "completion_length": 27.75,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 76.9,
      "completions/max_terminated_length": 76.9,
      "completions/mean_length": 27.75,
      "completions/mean_terminated_length": 27.75,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.45385779122541603,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 1.6949481964111328,
      "kl": 1.038349199295044,
      "learning_rate": 2.6235555555555558e-06,
      "loss": 0.001,
      "num_tokens": 8249379.0,
      "reward": 5.925,
      "reward_std": 0.15,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.05,
      "rewards/match_format_approximately/mean": 0.975,
      "rewards/match_format_approximately/std": 0.05,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6600
    },
    {
      "completion_length": 17.75,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 39.5,
      "completions/max_terminated_length": 39.5,
      "completions/mean_length": 17.75,
      "completions/mean_terminated_length": 17.75,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.45454545454545453,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.01325819082558155,
      "kl": 1.1541422367095948,
      "learning_rate": 2.619111111111111e-06,
      "loss": 0.0012,
      "num_tokens": 8260369.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6610
    },
    {
      "completion_length": 12.35,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.5,
      "completions/max_terminated_length": 15.5,
      "completions/mean_length": 12.35,
      "completions/mean_terminated_length": 12.35,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.45523311786549303,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003837710013613105,
      "kl": 1.2479790687561034,
      "learning_rate": 2.614666666666667e-06,
      "loss": 0.0012,
      "num_tokens": 8271919.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6620
    },
    {
      "completion_length": 12.4,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.5,
      "completions/max_terminated_length": 17.5,
      "completions/mean_length": 12.4,
      "completions/mean_terminated_length": 12.4,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.4559207811855316,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00758333969861269,
      "kl": 1.2035202145576478,
      "learning_rate": 2.6102222222222224e-06,
      "loss": 0.0012,
      "num_tokens": 8284087.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6630
    },
    {
      "completion_length": 11.875,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.5,
      "completions/max_terminated_length": 15.5,
      "completions/mean_length": 11.875,
      "completions/mean_terminated_length": 11.875,
      "completions/min_length": 9.4,
      "completions/min_terminated_length": 9.4,
      "epoch": 0.4566084445055701,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032163290306925774,
      "kl": 1.258764785528183,
      "learning_rate": 2.605777777777778e-06,
      "loss": 0.0013,
      "num_tokens": 8296478.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6640
    },
    {
      "completion_length": 20.175,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 47.7,
      "completions/max_terminated_length": 47.7,
      "completions/mean_length": 20.175,
      "completions/mean_terminated_length": 20.175,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.4572961078256086,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0021373762283474207,
      "kl": 1.3757518768310546,
      "learning_rate": 2.6017777777777782e-06,
      "loss": 0.0014,
      "num_tokens": 8308309.0,
      "reward": 5.95,
      "reward_std": 0.05773502588272095,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6650
    },
    {
      "completion_length": 23.675,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 59.1,
      "completions/max_terminated_length": 59.1,
      "completions/mean_length": 23.675,
      "completions/mean_terminated_length": 23.675,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.4579837711456471,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.004941616673022509,
      "kl": 0.9016536772251129,
      "learning_rate": 2.5973333333333335e-06,
      "loss": 0.0009,
      "num_tokens": 8321792.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6660
    },
    {
      "completion_length": 22.925,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 58.4,
      "completions/max_terminated_length": 58.4,
      "completions/mean_length": 22.925,
      "completions/mean_terminated_length": 22.925,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.4586714344656856,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.014675181359052658,
      "kl": 1.3598309397697448,
      "learning_rate": 2.592888888888889e-06,
      "loss": 0.0014,
      "num_tokens": 8334013.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6670
    },
    {
      "completion_length": 12.525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 18.0,
      "completions/max_terminated_length": 18.0,
      "completions/mean_length": 12.525,
      "completions/mean_terminated_length": 12.525,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.4593590977857241,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0958554819226265,
      "kl": 1.2881322383880616,
      "learning_rate": 2.5884444444444445e-06,
      "loss": 0.0013,
      "num_tokens": 8345978.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6680
    },
    {
      "completion_length": 11.075,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.2,
      "completions/max_terminated_length": 14.2,
      "completions/mean_length": 11.075,
      "completions/mean_terminated_length": 11.075,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.4600467611057626,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007988972589373589,
      "kl": 1.558272522687912,
      "learning_rate": 2.5840000000000006e-06,
      "loss": 0.0016,
      "num_tokens": 8357717.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6690
    },
    {
      "completion_length": 11.175,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.5,
      "completions/max_terminated_length": 15.5,
      "completions/mean_length": 11.175,
      "completions/mean_terminated_length": 11.175,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.46073442442580115,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023797177709639072,
      "kl": 1.2147677838802338,
      "learning_rate": 2.579555555555556e-06,
      "loss": 0.0012,
      "num_tokens": 8370428.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6700
    },
    {
      "completion_length": 11.225,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.1,
      "completions/max_terminated_length": 14.1,
      "completions/mean_length": 11.225,
      "completions/mean_terminated_length": 11.225,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.46142208774583965,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004716573283076286,
      "kl": 1.1666797995567322,
      "learning_rate": 2.5751111111111115e-06,
      "loss": 0.0012,
      "num_tokens": 8382905.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6710
    },
    {
      "completion_length": 12.525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.4,
      "completions/max_terminated_length": 17.4,
      "completions/mean_length": 12.525,
      "completions/mean_terminated_length": 12.525,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.46210975106587815,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004184249322861433,
      "kl": 1.1673298239707948,
      "learning_rate": 2.570666666666667e-06,
      "loss": 0.0012,
      "num_tokens": 8395142.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6720
    },
    {
      "completion_length": 19.5,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 48.6,
      "completions/max_terminated_length": 48.6,
      "completions/mean_length": 19.5,
      "completions/mean_terminated_length": 19.5,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.46279741438591665,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.006774708162993193,
      "kl": 1.487388950586319,
      "learning_rate": 2.566222222222222e-06,
      "loss": 0.0015,
      "num_tokens": 8407102.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6730
    },
    {
      "completion_length": 13.8,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 21.2,
      "completions/max_terminated_length": 21.2,
      "completions/mean_length": 13.8,
      "completions/mean_terminated_length": 13.8,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.46348507770595515,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002235209569334984,
      "kl": 1.0196092247962951,
      "learning_rate": 2.561777777777778e-06,
      "loss": 0.001,
      "num_tokens": 8420438.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6740
    },
    {
      "completion_length": 12.125,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.9,
      "completions/max_terminated_length": 15.9,
      "completions/mean_length": 12.125,
      "completions/mean_terminated_length": 12.125,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.46417274102599365,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.0041185058653354645,
      "kl": 1.2029572665691375,
      "learning_rate": 2.5573333333333335e-06,
      "loss": 0.0012,
      "num_tokens": 8433695.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.45,
      "rewards/check_coherence/std": 0.1,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6750
    },
    {
      "completion_length": 10.825,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.3,
      "completions/max_terminated_length": 13.3,
      "completions/mean_length": 10.825,
      "completions/mean_terminated_length": 10.825,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.4648604043460322,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.030052315443754196,
      "kl": 1.404645836353302,
      "learning_rate": 2.552888888888889e-06,
      "loss": 0.0014,
      "num_tokens": 8444880.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6760
    },
    {
      "completion_length": 12.35,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.3,
      "completions/max_terminated_length": 15.3,
      "completions/mean_length": 12.35,
      "completions/mean_terminated_length": 12.35,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.4655480676660707,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0051916250959038734,
      "kl": 1.1174580216407777,
      "learning_rate": 2.5484444444444444e-06,
      "loss": 0.0011,
      "num_tokens": 8456626.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6770
    },
    {
      "completion_length": 11.925,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.2,
      "completions/max_terminated_length": 15.2,
      "completions/mean_length": 11.925,
      "completions/mean_terminated_length": 11.925,
      "completions/min_length": 9.4,
      "completions/min_terminated_length": 9.4,
      "epoch": 0.4662357309861092,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005567606072872877,
      "kl": 1.062576812505722,
      "learning_rate": 2.5440000000000005e-06,
      "loss": 0.0011,
      "num_tokens": 8468635.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6780
    },
    {
      "completion_length": 23.6,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 60.1,
      "completions/max_terminated_length": 60.1,
      "completions/mean_length": 23.6,
      "completions/mean_terminated_length": 23.6,
      "completions/min_length": 9.4,
      "completions/min_terminated_length": 9.4,
      "epoch": 0.4669233943061477,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.009036717936396599,
      "kl": 1.821528172492981,
      "learning_rate": 2.539555555555556e-06,
      "loss": 0.0018,
      "num_tokens": 8482275.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6790
    },
    {
      "completion_length": 12.025,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.5,
      "completions/max_terminated_length": 14.5,
      "completions/mean_length": 12.025,
      "completions/mean_terminated_length": 12.025,
      "completions/min_length": 9.5,
      "completions/min_terminated_length": 9.5,
      "epoch": 0.4676110576261862,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006006492301821709,
      "kl": 1.1518677949905396,
      "learning_rate": 2.5351111111111115e-06,
      "loss": 0.0012,
      "num_tokens": 8494032.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6800
    },
    {
      "completion_length": 12.5,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.0,
      "completions/max_terminated_length": 16.0,
      "completions/mean_length": 12.5,
      "completions/mean_terminated_length": 12.5,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.4682987209462247,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0030979763250797987,
      "kl": 1.0430574774742127,
      "learning_rate": 2.5306666666666668e-06,
      "loss": 0.001,
      "num_tokens": 8506708.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6810
    },
    {
      "completion_length": 16.8,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 33.8,
      "completions/max_terminated_length": 33.8,
      "completions/mean_length": 16.8,
      "completions/mean_terminated_length": 16.8,
      "completions/min_length": 9.2,
      "completions/min_terminated_length": 9.2,
      "epoch": 0.4689863842662632,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0050082216039299965,
      "kl": 1.1010195553302764,
      "learning_rate": 2.526222222222223e-06,
      "loss": 0.0011,
      "num_tokens": 8518388.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6820
    },
    {
      "completion_length": 13.7,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 18.9,
      "completions/max_terminated_length": 18.9,
      "completions/mean_length": 13.7,
      "completions/mean_terminated_length": 13.7,
      "completions/min_length": 10.2,
      "completions/min_terminated_length": 10.2,
      "epoch": 0.46967404758630177,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.013809357769787312,
      "kl": 0.9667928516864777,
      "learning_rate": 2.521777777777778e-06,
      "loss": 0.001,
      "num_tokens": 8529944.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6830
    },
    {
      "completion_length": 11.65,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.5,
      "completions/max_terminated_length": 14.5,
      "completions/mean_length": 11.65,
      "completions/mean_terminated_length": 11.65,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.47036171090634027,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004135849419981241,
      "kl": 1.0926547050476074,
      "learning_rate": 2.5173333333333334e-06,
      "loss": 0.0011,
      "num_tokens": 8541106.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6840
    },
    {
      "completion_length": 12.025,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.7,
      "completions/max_terminated_length": 16.7,
      "completions/mean_length": 12.025,
      "completions/mean_terminated_length": 12.025,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.47104937422637877,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004002850037068129,
      "kl": 1.0938444793224336,
      "learning_rate": 2.512888888888889e-06,
      "loss": 0.0011,
      "num_tokens": 8552911.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6850
    },
    {
      "completion_length": 25.525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 65.6,
      "completions/max_terminated_length": 65.6,
      "completions/mean_length": 25.525,
      "completions/mean_terminated_length": 25.525,
      "completions/min_length": 10.5,
      "completions/min_terminated_length": 10.5,
      "epoch": 0.47173703754641727,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.004127295222133398,
      "kl": 1.0458847165107727,
      "learning_rate": 2.5084444444444444e-06,
      "loss": 0.001,
      "num_tokens": 8565248.0,
      "reward": 5.9625,
      "reward_std": 0.075,
      "rewards/check_coherence/mean": 1.4875,
      "rewards/check_coherence/std": 0.025,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6860
    },
    {
      "completion_length": 12.65,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.7,
      "completions/max_terminated_length": 15.7,
      "completions/mean_length": 12.65,
      "completions/mean_terminated_length": 12.65,
      "completions/min_length": 10.4,
      "completions/min_terminated_length": 10.4,
      "epoch": 0.47242470086645577,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025481998454779387,
      "kl": 1.1385599613189696,
      "learning_rate": 2.5040000000000005e-06,
      "loss": 0.0011,
      "num_tokens": 8575918.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6870
    },
    {
      "completion_length": 13.575,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 18.6,
      "completions/max_terminated_length": 18.6,
      "completions/mean_length": 13.575,
      "completions/mean_terminated_length": 13.575,
      "completions/min_length": 9.7,
      "completions/min_terminated_length": 9.7,
      "epoch": 0.47311236418649427,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004273406229913235,
      "kl": 1.358880877494812,
      "learning_rate": 2.4995555555555558e-06,
      "loss": 0.0014,
      "num_tokens": 8590145.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6880
    },
    {
      "completion_length": 13.275,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 20.1,
      "completions/max_terminated_length": 20.1,
      "completions/mean_length": 13.275,
      "completions/mean_terminated_length": 13.275,
      "completions/min_length": 9.9,
      "completions/min_terminated_length": 9.9,
      "epoch": 0.47380002750653283,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.004883030895143747,
      "kl": 1.3985342502593994,
      "learning_rate": 2.495111111111111e-06,
      "loss": 0.0014,
      "num_tokens": 8602292.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6890
    },
    {
      "completion_length": 16.1,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 30.3,
      "completions/max_terminated_length": 30.3,
      "completions/mean_length": 16.1,
      "completions/mean_terminated_length": 16.1,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.47448769082657133,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.004624954890459776,
      "kl": 1.0976074278354644,
      "learning_rate": 2.4906666666666667e-06,
      "loss": 0.0011,
      "num_tokens": 8614088.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6900
    },
    {
      "completion_length": 12.4,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.6,
      "completions/max_terminated_length": 16.6,
      "completions/mean_length": 12.4,
      "completions/mean_terminated_length": 12.4,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.47517535414660983,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.028318747878074646,
      "kl": 1.1434998154640197,
      "learning_rate": 2.4862222222222224e-06,
      "loss": 0.0011,
      "num_tokens": 8626912.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6910
    },
    {
      "completion_length": 13.175,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 18.4,
      "completions/max_terminated_length": 18.4,
      "completions/mean_length": 13.175,
      "completions/mean_terminated_length": 13.175,
      "completions/min_length": 9.6,
      "completions/min_terminated_length": 9.6,
      "epoch": 0.47586301746664833,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005463233217597008,
      "kl": 1.1536414802074433,
      "learning_rate": 2.481777777777778e-06,
      "loss": 0.0012,
      "num_tokens": 8639415.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6920
    },
    {
      "completion_length": 17.725,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 37.1,
      "completions/max_terminated_length": 37.1,
      "completions/mean_length": 17.725,
      "completions/mean_terminated_length": 17.725,
      "completions/min_length": 9.8,
      "completions/min_terminated_length": 9.8,
      "epoch": 0.47655068078668683,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.004396663047373295,
      "kl": 1.224897998571396,
      "learning_rate": 2.4773333333333334e-06,
      "loss": 0.0012,
      "num_tokens": 8651924.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.05,
      "rewards/match_format_approximately/mean": 0.975,
      "rewards/match_format_approximately/std": 0.05,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6930
    },
    {
      "completion_length": 11.426470588235293,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.61111111111111,
      "completions/max_terminated_length": 14.61111111111111,
      "completions/mean_length": 11.51388888888889,
      "completions/mean_terminated_length": 11.51388888888889,
      "completions/min_length": 9.11111111111111,
      "completions/min_terminated_length": 9.11111111111111,
      "epoch": 0.47723834410672533,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.014061485417187214,
      "kl": 1.0381863713264465,
      "learning_rate": 2.472888888888889e-06,
      "loss": 0.001,
      "num_tokens": 8663938.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6940
    },
    {
      "completion_length": 12.9,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.8,
      "completions/max_terminated_length": 16.8,
      "completions/mean_length": 12.9,
      "completions/mean_terminated_length": 12.9,
      "completions/min_length": 9.7,
      "completions/min_terminated_length": 9.7,
      "epoch": 0.47792600742676383,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.017283568158745766,
      "kl": 1.2144360840320587,
      "learning_rate": 2.4684444444444448e-06,
      "loss": 0.0012,
      "num_tokens": 8676042.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6950
    },
    {
      "completion_length": 12.575,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.4,
      "completions/max_terminated_length": 15.4,
      "completions/mean_length": 12.575,
      "completions/mean_terminated_length": 12.575,
      "completions/min_length": 10.6,
      "completions/min_terminated_length": 10.6,
      "epoch": 0.4786136707468024,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006373599637299776,
      "kl": 1.0703495502471925,
      "learning_rate": 2.4640000000000005e-06,
      "loss": 0.0011,
      "num_tokens": 8689881.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6960
    },
    {
      "completion_length": 11.525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.5,
      "completions/max_terminated_length": 15.5,
      "completions/mean_length": 11.525,
      "completions/mean_terminated_length": 11.525,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.4793013340668409,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003493634983897209,
      "kl": 1.2606799006462097,
      "learning_rate": 2.4595555555555557e-06,
      "loss": 0.0013,
      "num_tokens": 8701782.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6970
    },
    {
      "completion_length": 12.45,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.8,
      "completions/max_terminated_length": 16.8,
      "completions/mean_length": 12.45,
      "completions/mean_terminated_length": 12.45,
      "completions/min_length": 9.8,
      "completions/min_terminated_length": 9.8,
      "epoch": 0.4799889973868794,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.011007771827280521,
      "kl": 1.0327221691608428,
      "learning_rate": 2.4551111111111114e-06,
      "loss": 0.001,
      "num_tokens": 8713824.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6980
    },
    {
      "completion_length": 14.05,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 23.4,
      "completions/max_terminated_length": 23.4,
      "completions/mean_length": 14.05,
      "completions/mean_terminated_length": 14.05,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.4806766607069179,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0025435087736696005,
      "kl": 1.0930546462535857,
      "learning_rate": 2.4506666666666667e-06,
      "loss": 0.0011,
      "num_tokens": 8728734.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 6990
    },
    {
      "completion_length": 12.425,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.1,
      "completions/max_terminated_length": 17.1,
      "completions/mean_length": 12.425,
      "completions/mean_terminated_length": 12.425,
      "completions/min_length": 9.7,
      "completions/min_terminated_length": 9.7,
      "epoch": 0.4813643240269564,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004313310608267784,
      "kl": 1.1628154873847962,
      "learning_rate": 2.4462222222222224e-06,
      "loss": 0.0012,
      "num_tokens": 8743119.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7000
    },
    {
      "completion_length": 16.1,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 31.8,
      "completions/max_terminated_length": 31.8,
      "completions/mean_length": 16.1,
      "completions/mean_terminated_length": 16.1,
      "completions/min_length": 9.2,
      "completions/min_terminated_length": 9.2,
      "epoch": 0.4820519873469949,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.007840165868401527,
      "kl": 1.2261770963668823,
      "learning_rate": 2.441777777777778e-06,
      "loss": 0.0012,
      "num_tokens": 8755975.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7010
    },
    {
      "completion_length": 35.4,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 105.5,
      "completions/max_terminated_length": 105.5,
      "completions/mean_length": 35.4,
      "completions/mean_terminated_length": 35.4,
      "completions/min_length": 9.4,
      "completions/min_terminated_length": 9.4,
      "epoch": 0.4827396506670334,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.003724177833646536,
      "kl": 1.0877568125724792,
      "learning_rate": 2.4373333333333333e-06,
      "loss": 0.0011,
      "num_tokens": 8768903.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7020
    },
    {
      "completion_length": 11.825,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.3,
      "completions/max_terminated_length": 14.3,
      "completions/mean_length": 11.825,
      "completions/mean_terminated_length": 11.825,
      "completions/min_length": 9.8,
      "completions/min_terminated_length": 9.8,
      "epoch": 0.48342731398707195,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032062076497823,
      "kl": 1.1118020951747893,
      "learning_rate": 2.432888888888889e-06,
      "loss": 0.0011,
      "num_tokens": 8780440.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7030
    },
    {
      "completion_length": 12.425,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.4,
      "completions/max_terminated_length": 15.4,
      "completions/mean_length": 12.425,
      "completions/mean_terminated_length": 12.425,
      "completions/min_length": 10.4,
      "completions/min_terminated_length": 10.4,
      "epoch": 0.48411497730711045,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.021040547639131546,
      "kl": 1.0397081434726716,
      "learning_rate": 2.4284444444444447e-06,
      "loss": 0.001,
      "num_tokens": 8792553.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7040
    },
    {
      "completion_length": 12.375,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.5,
      "completions/max_terminated_length": 15.5,
      "completions/mean_length": 12.375,
      "completions/mean_terminated_length": 12.375,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.48480264062714895,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004737348761409521,
      "kl": 1.1431108355522155,
      "learning_rate": 2.4240000000000004e-06,
      "loss": 0.0011,
      "num_tokens": 8804176.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7050
    },
    {
      "completion_length": 13.075,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.1,
      "completions/max_terminated_length": 17.1,
      "completions/mean_length": 13.075,
      "completions/mean_terminated_length": 13.075,
      "completions/min_length": 9.7,
      "completions/min_terminated_length": 9.7,
      "epoch": 0.48549030394718745,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0026906414423137903,
      "kl": 0.9650578558444977,
      "learning_rate": 2.4195555555555557e-06,
      "loss": 0.001,
      "num_tokens": 8816415.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7060
    },
    {
      "completion_length": 12.225,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.7,
      "completions/max_terminated_length": 14.7,
      "completions/mean_length": 12.225,
      "completions/mean_terminated_length": 12.225,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.48617796726722595,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002324128057807684,
      "kl": 1.049161559343338,
      "learning_rate": 2.4151111111111114e-06,
      "loss": 0.001,
      "num_tokens": 8828372.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7070
    },
    {
      "completion_length": 11.975,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.2,
      "completions/max_terminated_length": 15.2,
      "completions/mean_length": 11.975,
      "completions/mean_terminated_length": 11.975,
      "completions/min_length": 9.2,
      "completions/min_terminated_length": 9.2,
      "epoch": 0.48686563058726445,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003440011525526643,
      "kl": 1.1597104012966155,
      "learning_rate": 2.410666666666667e-06,
      "loss": 0.0012,
      "num_tokens": 8839839.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7080
    },
    {
      "completion_length": 12.825,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.1,
      "completions/max_terminated_length": 16.1,
      "completions/mean_length": 12.825,
      "completions/mean_terminated_length": 12.825,
      "completions/min_length": 10.3,
      "completions/min_terminated_length": 10.3,
      "epoch": 0.487553293907303,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004029946867376566,
      "kl": 0.98714559674263,
      "learning_rate": 2.4062222222222223e-06,
      "loss": 0.001,
      "num_tokens": 8852208.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7090
    },
    {
      "completion_length": 12.65,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.8,
      "completions/max_terminated_length": 15.8,
      "completions/mean_length": 12.65,
      "completions/mean_terminated_length": 12.65,
      "completions/min_length": 10.7,
      "completions/min_terminated_length": 10.7,
      "epoch": 0.4882409572273415,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004136668052524328,
      "kl": 0.9978300929069519,
      "learning_rate": 2.401777777777778e-06,
      "loss": 0.001,
      "num_tokens": 8865298.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7100
    },
    {
      "completion_length": 14.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 20.4,
      "completions/max_terminated_length": 20.4,
      "completions/mean_length": 14.0,
      "completions/mean_terminated_length": 14.0,
      "completions/min_length": 9.4,
      "completions/min_terminated_length": 9.4,
      "epoch": 0.48892862054738,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0393175333738327,
      "kl": 1.0882606029510498,
      "learning_rate": 2.3973333333333333e-06,
      "loss": 0.0011,
      "num_tokens": 8877470.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7110
    },
    {
      "completion_length": 11.55,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.1,
      "completions/max_terminated_length": 15.1,
      "completions/mean_length": 11.55,
      "completions/mean_terminated_length": 11.55,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.4896162838674185,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003470748895779252,
      "kl": 1.0130544245243072,
      "learning_rate": 2.392888888888889e-06,
      "loss": 0.001,
      "num_tokens": 8891472.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7120
    },
    {
      "completion_length": 43.35,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 121.6,
      "completions/max_terminated_length": 121.6,
      "completions/mean_length": 43.35,
      "completions/mean_terminated_length": 43.35,
      "completions/min_length": 9.5,
      "completions/min_terminated_length": 9.5,
      "epoch": 0.490303947187457,
      "frac_reward_zero_std": 0.7,
      "grad_norm": 0.005588744767010212,
      "kl": 1.0089176952838899,
      "learning_rate": 2.3884444444444447e-06,
      "loss": 0.001,
      "num_tokens": 8903570.0,
      "reward": 5.9,
      "reward_std": 0.15773502588272095,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.45,
      "rewards/check_response_quality/std": 0.07886751294136048,
      "rewards/match_format_approximately/mean": 0.95,
      "rewards/match_format_approximately/std": 0.07886751294136048,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7130
    },
    {
      "completion_length": 21.725,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 53.3,
      "completions/max_terminated_length": 53.3,
      "completions/mean_length": 21.725,
      "completions/mean_terminated_length": 21.725,
      "completions/min_length": 9.7,
      "completions/min_terminated_length": 9.7,
      "epoch": 0.4909916105074955,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0038998995441943407,
      "kl": 2.2435892522335052,
      "learning_rate": 2.3840000000000004e-06,
      "loss": 0.0022,
      "num_tokens": 8916203.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7140
    },
    {
      "completion_length": 12.6,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.3,
      "completions/max_terminated_length": 16.3,
      "completions/mean_length": 12.6,
      "completions/mean_terminated_length": 12.6,
      "completions/min_length": 9.8,
      "completions/min_terminated_length": 9.8,
      "epoch": 0.491679273827534,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.7611438035964966,
      "kl": 1.965437227487564,
      "learning_rate": 2.3795555555555557e-06,
      "loss": 0.002,
      "num_tokens": 8927227.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7150
    },
    {
      "completion_length": 11.65,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.7,
      "completions/max_terminated_length": 13.7,
      "completions/mean_length": 11.65,
      "completions/mean_terminated_length": 11.65,
      "completions/min_length": 9.8,
      "completions/min_terminated_length": 9.8,
      "epoch": 0.49236693714757257,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.02354721911251545,
      "kl": 1.0878031313419343,
      "learning_rate": 2.3751111111111113e-06,
      "loss": 0.0011,
      "num_tokens": 8940697.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7160
    },
    {
      "completion_length": 20.825,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 45.6,
      "completions/max_terminated_length": 45.6,
      "completions/mean_length": 20.825,
      "completions/mean_terminated_length": 20.825,
      "completions/min_length": 10.4,
      "completions/min_terminated_length": 10.4,
      "epoch": 0.49305460046761107,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.005910597741603851,
      "kl": 1.0086887955665589,
      "learning_rate": 2.370666666666667e-06,
      "loss": 0.001,
      "num_tokens": 8952058.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7170
    },
    {
      "completion_length": 11.95,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.9,
      "completions/max_terminated_length": 15.9,
      "completions/mean_length": 11.95,
      "completions/mean_terminated_length": 11.95,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.49374226378764957,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003222224535420537,
      "kl": 1.2605473756790162,
      "learning_rate": 2.3662222222222227e-06,
      "loss": 0.0013,
      "num_tokens": 8965632.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7180
    },
    {
      "completion_length": 23.6,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 63.1,
      "completions/max_terminated_length": 63.1,
      "completions/mean_length": 23.6,
      "completions/mean_terminated_length": 23.6,
      "completions/min_length": 8.1,
      "completions/min_terminated_length": 8.1,
      "epoch": 0.49442992710768807,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.004417297430336475,
      "kl": 1.1330610036849975,
      "learning_rate": 2.361777777777778e-06,
      "loss": 0.0011,
      "num_tokens": 8977804.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7190
    },
    {
      "completion_length": 44.475,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 140.2,
      "completions/max_terminated_length": 140.2,
      "completions/mean_length": 44.475,
      "completions/mean_terminated_length": 44.475,
      "completions/min_length": 9.5,
      "completions/min_terminated_length": 9.5,
      "epoch": 0.4951175904277266,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.019977256655693054,
      "kl": 1.0677195250988007,
      "learning_rate": 2.3573333333333333e-06,
      "loss": 0.0011,
      "num_tokens": 8992319.0,
      "reward": 5.925,
      "reward_std": 0.15,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.05,
      "rewards/match_format_approximately/mean": 0.975,
      "rewards/match_format_approximately/std": 0.05,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7200
    },
    {
      "completion_length": 29.75,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 87.3,
      "completions/max_terminated_length": 87.3,
      "completions/mean_length": 29.75,
      "completions/mean_terminated_length": 29.75,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.4958052537477651,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.008993498049676418,
      "kl": 1.0857190370559693,
      "learning_rate": 2.352888888888889e-06,
      "loss": 0.0011,
      "num_tokens": 9004757.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.05,
      "rewards/match_format_approximately/mean": 0.975,
      "rewards/match_format_approximately/std": 0.05,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7210
    },
    {
      "completion_length": 13.55,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 18.8,
      "completions/max_terminated_length": 18.8,
      "completions/mean_length": 13.55,
      "completions/mean_terminated_length": 13.55,
      "completions/min_length": 10.1,
      "completions/min_terminated_length": 10.1,
      "epoch": 0.49649291706780363,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004971934948116541,
      "kl": 1.0797606348991393,
      "learning_rate": 2.3484444444444447e-06,
      "loss": 0.0011,
      "num_tokens": 9018407.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7220
    },
    {
      "completion_length": 12.15,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.8,
      "completions/max_terminated_length": 17.8,
      "completions/mean_length": 12.15,
      "completions/mean_terminated_length": 12.15,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.49718058038784213,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.022533122450113297,
      "kl": 1.2900502800941467,
      "learning_rate": 2.3440000000000003e-06,
      "loss": 0.0013,
      "num_tokens": 9031465.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7230
    },
    {
      "completion_length": 12.75,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 19.4,
      "completions/max_terminated_length": 19.4,
      "completions/mean_length": 12.75,
      "completions/mean_terminated_length": 12.75,
      "completions/min_length": 8.3,
      "completions/min_terminated_length": 8.3,
      "epoch": 0.49786824370788063,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0030792122706770897,
      "kl": 1.3528812944889068,
      "learning_rate": 2.3395555555555556e-06,
      "loss": 0.0014,
      "num_tokens": 9042471.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7240
    },
    {
      "completion_length": 11.475,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.9,
      "completions/max_terminated_length": 15.9,
      "completions/mean_length": 11.475,
      "completions/mean_terminated_length": 11.475,
      "completions/min_length": 8.1,
      "completions/min_terminated_length": 8.1,
      "epoch": 0.49855590702791913,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0058341664262115955,
      "kl": 1.2619741141796113,
      "learning_rate": 2.3351111111111113e-06,
      "loss": 0.0013,
      "num_tokens": 9052594.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7250
    },
    {
      "completion_length": 20.475,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 49.1,
      "completions/max_terminated_length": 49.1,
      "completions/mean_length": 20.475,
      "completions/mean_terminated_length": 20.475,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.49924357034795763,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 1.8776947259902954,
      "kl": 1.384527599811554,
      "learning_rate": 2.330666666666667e-06,
      "loss": 0.0014,
      "num_tokens": 9064341.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7260
    },
    {
      "completion_length": 12.875,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.8,
      "completions/max_terminated_length": 17.8,
      "completions/mean_length": 12.875,
      "completions/mean_terminated_length": 12.875,
      "completions/min_length": 9.2,
      "completions/min_terminated_length": 9.2,
      "epoch": 0.49993123366799613,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004909676965326071,
      "kl": 1.2514336943626403,
      "learning_rate": 2.3262222222222227e-06,
      "loss": 0.0013,
      "num_tokens": 9075776.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7270
    },
    {
      "completion_length": 13.025,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 18.3,
      "completions/max_terminated_length": 18.3,
      "completions/mean_length": 13.025,
      "completions/mean_terminated_length": 13.025,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.5006188969880346,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.009278550744056702,
      "kl": 1.2726584792137146,
      "learning_rate": 2.321777777777778e-06,
      "loss": 0.0013,
      "num_tokens": 9090041.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7280
    },
    {
      "completion_length": 36.35,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 110.0,
      "completions/max_terminated_length": 110.0,
      "completions/mean_length": 36.35,
      "completions/mean_terminated_length": 36.35,
      "completions/min_length": 9.4,
      "completions/min_terminated_length": 9.4,
      "epoch": 0.5013065603080732,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.011710338294506073,
      "kl": 1.0427227377891541,
      "learning_rate": 2.3173333333333336e-06,
      "loss": 0.001,
      "num_tokens": 9105039.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7290
    },
    {
      "completion_length": 12.925,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 18.2,
      "completions/max_terminated_length": 18.2,
      "completions/mean_length": 12.925,
      "completions/mean_terminated_length": 12.925,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.5019942236281116,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.004796651192009449,
      "kl": 1.1195420622825623,
      "learning_rate": 2.312888888888889e-06,
      "loss": 0.0011,
      "num_tokens": 9116628.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7300
    },
    {
      "completion_length": 11.7,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.1,
      "completions/max_terminated_length": 16.1,
      "completions/mean_length": 11.7,
      "completions/mean_terminated_length": 11.7,
      "completions/min_length": 8.2,
      "completions/min_terminated_length": 8.2,
      "epoch": 0.5026818869481502,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0037184860557317734,
      "kl": 1.1613843977451324,
      "learning_rate": 2.3084444444444446e-06,
      "loss": 0.0012,
      "num_tokens": 9129592.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7310
    },
    {
      "completion_length": 23.125,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 59.6,
      "completions/max_terminated_length": 59.6,
      "completions/mean_length": 23.125,
      "completions/mean_terminated_length": 23.125,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.5033695502681887,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.004970069508999586,
      "kl": 1.218816888332367,
      "learning_rate": 2.3040000000000003e-06,
      "loss": 0.0012,
      "num_tokens": 9141525.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7320
    },
    {
      "completion_length": 14.8,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 22.9,
      "completions/max_terminated_length": 22.9,
      "completions/mean_length": 14.8,
      "completions/mean_terminated_length": 14.8,
      "completions/min_length": 9.4,
      "completions/min_terminated_length": 9.4,
      "epoch": 0.5040572135882272,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0029654251411557198,
      "kl": 1.0582695841789245,
      "learning_rate": 2.2995555555555556e-06,
      "loss": 0.0011,
      "num_tokens": 9154301.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7330
    },
    {
      "completion_length": 13.4,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 20.5,
      "completions/max_terminated_length": 20.5,
      "completions/mean_length": 13.4,
      "completions/mean_terminated_length": 13.4,
      "completions/min_length": 9.5,
      "completions/min_terminated_length": 9.5,
      "epoch": 0.5047448769082657,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.006998724769800901,
      "kl": 1.0817440152168274,
      "learning_rate": 2.2951111111111113e-06,
      "loss": 0.0011,
      "num_tokens": 9166301.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7340
    },
    {
      "completion_length": 15.25,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 26.8,
      "completions/max_terminated_length": 26.8,
      "completions/mean_length": 15.25,
      "completions/mean_terminated_length": 15.25,
      "completions/min_length": 9.6,
      "completions/min_terminated_length": 9.6,
      "epoch": 0.5054325402283042,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 2.057863235473633,
      "kl": 1.164160829782486,
      "learning_rate": 2.290666666666667e-06,
      "loss": 0.0012,
      "num_tokens": 9178547.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7350
    },
    {
      "completion_length": 12.875,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.1,
      "completions/max_terminated_length": 16.1,
      "completions/mean_length": 12.875,
      "completions/mean_terminated_length": 12.875,
      "completions/min_length": 9.7,
      "completions/min_terminated_length": 9.7,
      "epoch": 0.5061202035483428,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0038249208591878414,
      "kl": 0.9877580821514129,
      "learning_rate": 2.2862222222222226e-06,
      "loss": 0.001,
      "num_tokens": 9190510.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7360
    },
    {
      "completion_length": 21.55,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 51.9,
      "completions/max_terminated_length": 51.9,
      "completions/mean_length": 21.55,
      "completions/mean_terminated_length": 21.55,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.5068078668683812,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 1.4600623846054077,
      "kl": 1.057120794057846,
      "learning_rate": 2.281777777777778e-06,
      "loss": 0.0011,
      "num_tokens": 9201872.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7370
    },
    {
      "completion_length": 28.325,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 80.0,
      "completions/max_terminated_length": 80.0,
      "completions/mean_length": 28.325,
      "completions/mean_terminated_length": 28.325,
      "completions/min_length": 9.5,
      "completions/min_terminated_length": 9.5,
      "epoch": 0.5074955301884198,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.007701764348894358,
      "kl": 1.1267670691013336,
      "learning_rate": 2.2773333333333336e-06,
      "loss": 0.0011,
      "num_tokens": 9214445.0,
      "reward": 5.9625,
      "reward_std": 0.075,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.05,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7380
    },
    {
      "completion_length": 44.175,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 141.4,
      "completions/max_terminated_length": 141.4,
      "completions/mean_length": 44.175,
      "completions/mean_terminated_length": 44.175,
      "completions/min_length": 10.1,
      "completions/min_terminated_length": 10.1,
      "epoch": 0.5081831935084583,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 1.5618634223937988,
      "kl": 1.123524260520935,
      "learning_rate": 2.2728888888888893e-06,
      "loss": 0.0011,
      "num_tokens": 9230012.0,
      "reward": 5.925,
      "reward_std": 0.15,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.05,
      "rewards/match_format_approximately/mean": 0.975,
      "rewards/match_format_approximately/std": 0.05,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7390
    },
    {
      "completion_length": 13.175,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 18.4,
      "completions/max_terminated_length": 18.4,
      "completions/mean_length": 13.175,
      "completions/mean_terminated_length": 13.175,
      "completions/min_length": 9.9,
      "completions/min_terminated_length": 9.9,
      "epoch": 0.5088708568284968,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005219184327870607,
      "kl": 1.0605081737041473,
      "learning_rate": 2.2684444444444446e-06,
      "loss": 0.0011,
      "num_tokens": 9242431.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7400
    },
    {
      "completion_length": 12.45,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.4,
      "completions/max_terminated_length": 16.4,
      "completions/mean_length": 12.45,
      "completions/mean_terminated_length": 12.45,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.5095585201485353,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0051316008903086185,
      "kl": 1.2531267821788787,
      "learning_rate": 2.2640000000000003e-06,
      "loss": 0.0013,
      "num_tokens": 9253669.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7410
    },
    {
      "completion_length": 12.15,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.3,
      "completions/max_terminated_length": 15.3,
      "completions/mean_length": 12.15,
      "completions/mean_terminated_length": 12.15,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.5102461834685738,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0031868712976574898,
      "kl": 1.106162852048874,
      "learning_rate": 2.2595555555555555e-06,
      "loss": 0.0011,
      "num_tokens": 9265887.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7420
    },
    {
      "completion_length": 15.25,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 25.9,
      "completions/max_terminated_length": 25.9,
      "completions/mean_length": 15.25,
      "completions/mean_terminated_length": 15.25,
      "completions/min_length": 9.9,
      "completions/min_terminated_length": 9.9,
      "epoch": 0.5109338467886123,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.005863267928361893,
      "kl": 1.2646816730499268,
      "learning_rate": 2.2551111111111112e-06,
      "loss": 0.0013,
      "num_tokens": 9278577.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7430
    },
    {
      "completion_length": 11.525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.7,
      "completions/max_terminated_length": 14.7,
      "completions/mean_length": 11.525,
      "completions/mean_terminated_length": 11.525,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.5116215101086508,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004301924724131823,
      "kl": 1.326978874206543,
      "learning_rate": 2.250666666666667e-06,
      "loss": 0.0013,
      "num_tokens": 9289242.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7440
    },
    {
      "completion_length": 15.85,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 32.2,
      "completions/max_terminated_length": 32.2,
      "completions/mean_length": 15.85,
      "completions/mean_terminated_length": 15.85,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.5123091734286893,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.013630717992782593,
      "kl": 1.1819639325141906,
      "learning_rate": 2.2462222222222226e-06,
      "loss": 0.0012,
      "num_tokens": 9300732.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7450
    },
    {
      "completion_length": 11.425,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.7,
      "completions/max_terminated_length": 15.7,
      "completions/mean_length": 11.425,
      "completions/mean_terminated_length": 11.425,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.5129968367487279,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00520959822461009,
      "kl": 1.3894258320331574,
      "learning_rate": 2.241777777777778e-06,
      "loss": 0.0014,
      "num_tokens": 9314441.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7460
    },
    {
      "completion_length": 12.275,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 18.6,
      "completions/max_terminated_length": 18.6,
      "completions/mean_length": 12.275,
      "completions/mean_terminated_length": 12.275,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.5136845000687663,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004836928565055132,
      "kl": 1.126960998773575,
      "learning_rate": 2.2373333333333336e-06,
      "loss": 0.0011,
      "num_tokens": 9327980.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7470
    },
    {
      "completion_length": 21.275,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 52.5,
      "completions/max_terminated_length": 52.5,
      "completions/mean_length": 21.275,
      "completions/mean_terminated_length": 21.275,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.5143721633888049,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.004135287832468748,
      "kl": 1.2688943803310395,
      "learning_rate": 2.2328888888888893e-06,
      "loss": 0.0013,
      "num_tokens": 9341155.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7480
    },
    {
      "completion_length": 16.175,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 33.5,
      "completions/max_terminated_length": 33.5,
      "completions/mean_length": 16.175,
      "completions/mean_terminated_length": 16.175,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.5150598267088433,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.006331494078040123,
      "kl": 1.248352313041687,
      "learning_rate": 2.228444444444445e-06,
      "loss": 0.0012,
      "num_tokens": 9353610.0,
      "reward": 5.925,
      "reward_std": 0.15,
      "rewards/check_coherence/mean": 1.45,
      "rewards/check_coherence/std": 0.1,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7490
    },
    {
      "completion_length": 12.975,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 18.1,
      "completions/max_terminated_length": 18.1,
      "completions/mean_length": 12.975,
      "completions/mean_terminated_length": 12.975,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.5157474900288819,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027948105707764626,
      "kl": 1.1735309839248658,
      "learning_rate": 2.2240000000000002e-06,
      "loss": 0.0012,
      "num_tokens": 9366841.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7500
    },
    {
      "completion_length": 12.2,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.8,
      "completions/max_terminated_length": 15.8,
      "completions/mean_length": 12.2,
      "completions/mean_terminated_length": 12.2,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.5164351533489203,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0080268494784832,
      "kl": 1.3003155648708344,
      "learning_rate": 2.2195555555555555e-06,
      "loss": 0.0013,
      "num_tokens": 9377665.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7510
    },
    {
      "completion_length": 13.175,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.8,
      "completions/max_terminated_length": 17.8,
      "completions/mean_length": 13.175,
      "completions/mean_terminated_length": 13.175,
      "completions/min_length": 10.3,
      "completions/min_terminated_length": 10.3,
      "epoch": 0.5171228166689589,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034015493001788855,
      "kl": 1.1480665683746338,
      "learning_rate": 2.215111111111111e-06,
      "loss": 0.0011,
      "num_tokens": 9390644.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7520
    },
    {
      "completion_length": 11.05,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.0,
      "completions/max_terminated_length": 14.0,
      "completions/mean_length": 11.05,
      "completions/mean_terminated_length": 11.05,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.5178104799889974,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002507806057110429,
      "kl": 1.3187538743019105,
      "learning_rate": 2.210666666666667e-06,
      "loss": 0.0013,
      "num_tokens": 9403746.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7530
    },
    {
      "completion_length": 15.55,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 29.6,
      "completions/max_terminated_length": 29.6,
      "completions/mean_length": 15.55,
      "completions/mean_terminated_length": 15.55,
      "completions/min_length": 9.6,
      "completions/min_terminated_length": 9.6,
      "epoch": 0.5184981433090359,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.040676817297935486,
      "kl": 1.3105673789978027,
      "learning_rate": 2.2062222222222226e-06,
      "loss": 0.0013,
      "num_tokens": 9416300.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7540
    },
    {
      "completion_length": 12.175,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.3,
      "completions/max_terminated_length": 17.3,
      "completions/mean_length": 12.175,
      "completions/mean_terminated_length": 12.175,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.5191858066290744,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0030532637611031532,
      "kl": 1.3284153163433075,
      "learning_rate": 2.201777777777778e-06,
      "loss": 0.0013,
      "num_tokens": 9428207.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7550
    },
    {
      "completion_length": 11.075,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.0,
      "completions/max_terminated_length": 14.0,
      "completions/mean_length": 11.075,
      "completions/mean_terminated_length": 11.075,
      "completions/min_length": 8.2,
      "completions/min_terminated_length": 8.2,
      "epoch": 0.5198734699491129,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032129832543432713,
      "kl": 1.3229759514331818,
      "learning_rate": 2.1973333333333335e-06,
      "loss": 0.0013,
      "num_tokens": 9442022.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7560
    },
    {
      "completion_length": 11.625,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.6,
      "completions/max_terminated_length": 13.6,
      "completions/mean_length": 11.625,
      "completions/mean_terminated_length": 11.625,
      "completions/min_length": 9.8,
      "completions/min_terminated_length": 9.8,
      "epoch": 0.5205611332691514,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005039629060775042,
      "kl": 1.0451285064220428,
      "learning_rate": 2.1928888888888892e-06,
      "loss": 0.001,
      "num_tokens": 9456135.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7570
    },
    {
      "completion_length": 12.175,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.3,
      "completions/max_terminated_length": 17.3,
      "completions/mean_length": 12.175,
      "completions/mean_terminated_length": 12.175,
      "completions/min_length": 8.6,
      "completions/min_terminated_length": 8.6,
      "epoch": 0.5212487965891899,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.011741072870790958,
      "kl": 1.2608206033706666,
      "learning_rate": 2.188444444444445e-06,
      "loss": 0.0013,
      "num_tokens": 9465490.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7580
    },
    {
      "completion_length": 12.025,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.0,
      "completions/max_terminated_length": 16.0,
      "completions/mean_length": 12.025,
      "completions/mean_terminated_length": 12.025,
      "completions/min_length": 9.5,
      "completions/min_terminated_length": 9.5,
      "epoch": 0.5219364599092284,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0034133840817958117,
      "kl": 1.2925956785678863,
      "learning_rate": 2.184e-06,
      "loss": 0.0013,
      "num_tokens": 9479079.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7590
    },
    {
      "completion_length": 13.45,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 21.4,
      "completions/max_terminated_length": 21.4,
      "completions/mean_length": 13.45,
      "completions/mean_terminated_length": 13.45,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.522624123229267,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.014618651010096073,
      "kl": 1.281085979938507,
      "learning_rate": 2.179555555555556e-06,
      "loss": 0.0013,
      "num_tokens": 9490605.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7600
    },
    {
      "completion_length": 12.7,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.9,
      "completions/max_terminated_length": 16.9,
      "completions/mean_length": 12.7,
      "completions/mean_terminated_length": 12.7,
      "completions/min_length": 9.9,
      "completions/min_terminated_length": 9.9,
      "epoch": 0.5233117865493054,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.03030218929052353,
      "kl": 1.2633468508720398,
      "learning_rate": 2.175111111111111e-06,
      "loss": 0.0013,
      "num_tokens": 9503505.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7610
    },
    {
      "completion_length": 11.725,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.6,
      "completions/max_terminated_length": 14.6,
      "completions/mean_length": 11.725,
      "completions/mean_terminated_length": 11.725,
      "completions/min_length": 9.4,
      "completions/min_terminated_length": 9.4,
      "epoch": 0.523999449869344,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0029963438864797354,
      "kl": 1.3427102744579316,
      "learning_rate": 2.170666666666667e-06,
      "loss": 0.0013,
      "num_tokens": 9515646.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7620
    },
    {
      "completion_length": 11.375,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.8,
      "completions/max_terminated_length": 14.8,
      "completions/mean_length": 11.375,
      "completions/mean_terminated_length": 11.375,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.5246871131893824,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005397267173975706,
      "kl": 1.4040341019630431,
      "learning_rate": 2.1662222222222225e-06,
      "loss": 0.0014,
      "num_tokens": 9527161.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7630
    },
    {
      "completion_length": 11.975,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.9,
      "completions/max_terminated_length": 16.9,
      "completions/mean_length": 11.975,
      "completions/mean_terminated_length": 11.975,
      "completions/min_length": 8.6,
      "completions/min_terminated_length": 8.6,
      "epoch": 0.525374776509421,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.014865939505398273,
      "kl": 1.3119125723838807,
      "learning_rate": 2.161777777777778e-06,
      "loss": 0.0013,
      "num_tokens": 9539084.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7640
    },
    {
      "completion_length": 11.8,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.5,
      "completions/max_terminated_length": 15.5,
      "completions/mean_length": 11.8,
      "completions/mean_terminated_length": 11.8,
      "completions/min_length": 8.2,
      "completions/min_terminated_length": 8.2,
      "epoch": 0.5260624398294595,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0044557503424584866,
      "kl": 1.3221172630786895,
      "learning_rate": 2.1573333333333335e-06,
      "loss": 0.0013,
      "num_tokens": 9550292.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7650
    },
    {
      "completion_length": 13.35,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 18.8,
      "completions/max_terminated_length": 18.8,
      "completions/mean_length": 13.35,
      "completions/mean_terminated_length": 13.35,
      "completions/min_length": 9.6,
      "completions/min_terminated_length": 9.6,
      "epoch": 0.526750103149498,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0037254090420901775,
      "kl": 1.1248468935489655,
      "learning_rate": 2.152888888888889e-06,
      "loss": 0.0011,
      "num_tokens": 9562470.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7660
    },
    {
      "completion_length": 13.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.7,
      "completions/max_terminated_length": 17.7,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 9.9,
      "completions/min_terminated_length": 9.9,
      "epoch": 0.5274377664695366,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002507027005776763,
      "kl": 1.1201979637145996,
      "learning_rate": 2.148444444444445e-06,
      "loss": 0.0011,
      "num_tokens": 9577014.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7670
    },
    {
      "completion_length": 17.025,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 37.5,
      "completions/max_terminated_length": 37.5,
      "completions/mean_length": 17.025,
      "completions/mean_terminated_length": 17.025,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.528125429789575,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.003964269068092108,
      "kl": 1.2861777245998383,
      "learning_rate": 2.144e-06,
      "loss": 0.0013,
      "num_tokens": 9589907.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7680
    },
    {
      "completion_length": 11.175,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 11.175,
      "completions/mean_terminated_length": 11.175,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.5288130931096136,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002470179693773389,
      "kl": 1.1973251819610595,
      "learning_rate": 2.139555555555556e-06,
      "loss": 0.0012,
      "num_tokens": 9602342.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7690
    },
    {
      "completion_length": 12.375,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.7,
      "completions/max_terminated_length": 17.7,
      "completions/mean_length": 12.375,
      "completions/mean_terminated_length": 12.375,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.529500756429652,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.012987960129976273,
      "kl": 1.1113298773765563,
      "learning_rate": 2.1351111111111115e-06,
      "loss": 0.0011,
      "num_tokens": 9614297.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7700
    },
    {
      "completion_length": 12.125,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.6,
      "completions/max_terminated_length": 16.6,
      "completions/mean_length": 12.125,
      "completions/mean_terminated_length": 12.125,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.5301884197496906,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004978030454367399,
      "kl": 1.324486207962036,
      "learning_rate": 2.130666666666667e-06,
      "loss": 0.0013,
      "num_tokens": 9626970.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7710
    },
    {
      "completion_length": 11.575,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.3,
      "completions/max_terminated_length": 15.3,
      "completions/mean_length": 11.575,
      "completions/mean_terminated_length": 11.575,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.5308760830697291,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.027692638337612152,
      "kl": 1.2244141578674317,
      "learning_rate": 2.1262222222222225e-06,
      "loss": 0.0012,
      "num_tokens": 9638625.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7720
    },
    {
      "completion_length": 11.675,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.7,
      "completions/max_terminated_length": 14.7,
      "completions/mean_length": 11.675,
      "completions/mean_terminated_length": 11.675,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.5315637463897676,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003005094826221466,
      "kl": 1.3298441231250764,
      "learning_rate": 2.1217777777777778e-06,
      "loss": 0.0013,
      "num_tokens": 9650808.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7730
    },
    {
      "completion_length": 13.9,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 20.8,
      "completions/max_terminated_length": 20.8,
      "completions/mean_length": 13.9,
      "completions/mean_terminated_length": 13.9,
      "completions/min_length": 9.7,
      "completions/min_terminated_length": 9.7,
      "epoch": 0.5322514097098061,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0026819314807653427,
      "kl": 1.2448628067970275,
      "learning_rate": 2.1173333333333334e-06,
      "loss": 0.0012,
      "num_tokens": 9661696.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7740
    },
    {
      "completion_length": 17.225,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 34.7,
      "completions/max_terminated_length": 34.7,
      "completions/mean_length": 17.225,
      "completions/mean_terminated_length": 17.225,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.5329390730298446,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.002824925584718585,
      "kl": 1.2569087088108062,
      "learning_rate": 2.112888888888889e-06,
      "loss": 0.0013,
      "num_tokens": 9673633.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7750
    },
    {
      "completion_length": 11.3,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.2,
      "completions/max_terminated_length": 14.2,
      "completions/mean_length": 11.3,
      "completions/mean_terminated_length": 11.3,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.5336267363498831,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005798778962343931,
      "kl": 1.2169726014137268,
      "learning_rate": 2.108444444444445e-06,
      "loss": 0.0012,
      "num_tokens": 9685013.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7760
    },
    {
      "completion_length": 11.925,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.4,
      "completions/max_terminated_length": 16.4,
      "completions/mean_length": 11.925,
      "completions/mean_terminated_length": 11.925,
      "completions/min_length": 8.4,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.5343143996699216,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027706269174814224,
      "kl": 1.3897801697254182,
      "learning_rate": 2.104e-06,
      "loss": 0.0014,
      "num_tokens": 9696738.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7770
    },
    {
      "completion_length": 13.375,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 20.4,
      "completions/max_terminated_length": 20.4,
      "completions/mean_length": 13.375,
      "completions/mean_terminated_length": 13.375,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.5350020629899601,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005443184170871973,
      "kl": 1.1451765239238738,
      "learning_rate": 2.099555555555556e-06,
      "loss": 0.0011,
      "num_tokens": 9709865.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7780
    },
    {
      "completion_length": 11.85,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.4,
      "completions/max_terminated_length": 17.4,
      "completions/mean_length": 11.85,
      "completions/mean_terminated_length": 11.85,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.5356897263099987,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002919774502515793,
      "kl": 1.372281551361084,
      "learning_rate": 2.0951111111111115e-06,
      "loss": 0.0014,
      "num_tokens": 9721363.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7790
    },
    {
      "completion_length": 18.75,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 38.7,
      "completions/max_terminated_length": 38.7,
      "completions/mean_length": 18.75,
      "completions/mean_terminated_length": 18.75,
      "completions/min_length": 9.4,
      "completions/min_terminated_length": 9.4,
      "epoch": 0.5363773896300371,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0032499770168215036,
      "kl": 1.1967048406600953,
      "learning_rate": 2.0906666666666668e-06,
      "loss": 0.0012,
      "num_tokens": 9734369.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7800
    },
    {
      "completion_length": 12.825,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.9,
      "completions/max_terminated_length": 17.9,
      "completions/mean_length": 12.825,
      "completions/mean_terminated_length": 12.825,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.5370650529500757,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0038574356585741043,
      "kl": 1.216644722223282,
      "learning_rate": 2.0862222222222224e-06,
      "loss": 0.0012,
      "num_tokens": 9745214.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7810
    },
    {
      "completion_length": 12.125,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.8,
      "completions/max_terminated_length": 17.8,
      "completions/mean_length": 12.125,
      "completions/mean_terminated_length": 12.125,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.5377527162701141,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.010060185566544533,
      "kl": 1.2828055381774903,
      "learning_rate": 2.0817777777777777e-06,
      "loss": 0.0013,
      "num_tokens": 9756951.0,
      "reward": 5.9875,
      "reward_std": 0.025,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7820
    },
    {
      "completion_length": 12.55,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.2,
      "completions/max_terminated_length": 17.2,
      "completions/mean_length": 12.55,
      "completions/mean_terminated_length": 12.55,
      "completions/min_length": 9.4,
      "completions/min_terminated_length": 9.4,
      "epoch": 0.5384403795901527,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007177610415965319,
      "kl": 1.303609848022461,
      "learning_rate": 2.0773333333333334e-06,
      "loss": 0.0013,
      "num_tokens": 9769141.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7830
    },
    {
      "completion_length": 11.7,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.2,
      "completions/max_terminated_length": 15.2,
      "completions/mean_length": 11.7,
      "completions/mean_terminated_length": 11.7,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.5391280429101911,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00721388915553689,
      "kl": 1.2078775942325592,
      "learning_rate": 2.072888888888889e-06,
      "loss": 0.0012,
      "num_tokens": 9782721.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7840
    },
    {
      "completion_length": 12.875,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.8,
      "completions/max_terminated_length": 16.8,
      "completions/mean_length": 12.875,
      "completions/mean_terminated_length": 12.875,
      "completions/min_length": 9.7,
      "completions/min_terminated_length": 9.7,
      "epoch": 0.5398157062302297,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.007724335417151451,
      "kl": 1.339196938276291,
      "learning_rate": 2.068444444444445e-06,
      "loss": 0.0013,
      "num_tokens": 9795264.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7850
    },
    {
      "completion_length": 11.375,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.5,
      "completions/max_terminated_length": 14.5,
      "completions/mean_length": 11.375,
      "completions/mean_terminated_length": 11.375,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.5405033695502682,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0032738870941102505,
      "kl": 1.2018405854701997,
      "learning_rate": 2.064e-06,
      "loss": 0.0012,
      "num_tokens": 9805751.0,
      "reward": 5.925,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.425,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7860
    },
    {
      "completion_length": 12.275,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.7,
      "completions/max_terminated_length": 16.7,
      "completions/mean_length": 12.275,
      "completions/mean_terminated_length": 12.275,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.5411910328703067,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003743327222764492,
      "kl": 1.7937437176704407,
      "learning_rate": 2.0595555555555558e-06,
      "loss": 0.0018,
      "num_tokens": 9818498.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7870
    },
    {
      "completion_length": 13.15,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 19.8,
      "completions/max_terminated_length": 19.8,
      "completions/mean_length": 13.15,
      "completions/mean_terminated_length": 13.15,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.5418786961903452,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.006482637953013182,
      "kl": 1.3055089831352233,
      "learning_rate": 2.0551111111111114e-06,
      "loss": 0.0013,
      "num_tokens": 9828884.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7880
    },
    {
      "completion_length": 11.9,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.2,
      "completions/max_terminated_length": 15.2,
      "completions/mean_length": 11.9,
      "completions/mean_terminated_length": 11.9,
      "completions/min_length": 9.8,
      "completions/min_terminated_length": 9.8,
      "epoch": 0.5425663595103837,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032682991586625576,
      "kl": 1.2907763242721557,
      "learning_rate": 2.0506666666666667e-06,
      "loss": 0.0013,
      "num_tokens": 9840316.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7890
    },
    {
      "completion_length": 13.85,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 23.5,
      "completions/max_terminated_length": 23.5,
      "completions/mean_length": 13.85,
      "completions/mean_terminated_length": 13.85,
      "completions/min_length": 9.2,
      "completions/min_terminated_length": 9.2,
      "epoch": 0.5432540228304222,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 2.878571033477783,
      "kl": 1.3566003799438477,
      "learning_rate": 2.0462222222222224e-06,
      "loss": 0.0014,
      "num_tokens": 9852170.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7900
    },
    {
      "completion_length": 12.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.1,
      "completions/max_terminated_length": 16.1,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.5439416861504607,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.015462033450603485,
      "kl": 1.1428892016410828,
      "learning_rate": 2.041777777777778e-06,
      "loss": 0.0011,
      "num_tokens": 9865254.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7910
    },
    {
      "completion_length": 11.9,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.6,
      "completions/max_terminated_length": 15.6,
      "completions/mean_length": 11.9,
      "completions/mean_terminated_length": 11.9,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.5446293494704992,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004104136023670435,
      "kl": 1.057891947031021,
      "learning_rate": 2.0373333333333334e-06,
      "loss": 0.0011,
      "num_tokens": 9877294.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7920
    },
    {
      "completion_length": 32.025,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 94.1,
      "completions/max_terminated_length": 94.1,
      "completions/mean_length": 32.025,
      "completions/mean_terminated_length": 32.025,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.5453170127905378,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.00987865962088108,
      "kl": 1.331625509262085,
      "learning_rate": 2.032888888888889e-06,
      "loss": 0.0013,
      "num_tokens": 9890899.0,
      "reward": 5.9125,
      "reward_std": 0.175,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4625,
      "rewards/check_response_quality/std": 0.075,
      "rewards/match_format_approximately/mean": 0.975,
      "rewards/match_format_approximately/std": 0.05,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7930
    },
    {
      "completion_length": 13.05,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.8,
      "completions/max_terminated_length": 16.8,
      "completions/mean_length": 13.05,
      "completions/mean_terminated_length": 13.05,
      "completions/min_length": 10.9,
      "completions/min_terminated_length": 10.9,
      "epoch": 0.5460046761105762,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002187013393267989,
      "kl": 1.2939410388469696,
      "learning_rate": 2.0284444444444447e-06,
      "loss": 0.0013,
      "num_tokens": 9903709.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7940
    },
    {
      "completion_length": 13.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.1,
      "completions/max_terminated_length": 16.1,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 10.7,
      "completions/min_terminated_length": 10.7,
      "epoch": 0.5466923394306148,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0029057443607598543,
      "kl": 0.9948434948921203,
      "learning_rate": 2.024e-06,
      "loss": 0.001,
      "num_tokens": 9915857.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7950
    },
    {
      "completion_length": 11.85,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.6,
      "completions/max_terminated_length": 13.6,
      "completions/mean_length": 11.85,
      "completions/mean_terminated_length": 11.85,
      "completions/min_length": 9.9,
      "completions/min_terminated_length": 9.9,
      "epoch": 0.5473800027506532,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022877950686961412,
      "kl": 1.2410366594791413,
      "learning_rate": 2.0195555555555557e-06,
      "loss": 0.0012,
      "num_tokens": 9929511.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7960
    },
    {
      "completion_length": 20.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 46.6,
      "completions/max_terminated_length": 46.6,
      "completions/mean_length": 20.0,
      "completions/mean_terminated_length": 20.0,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.5480676660706918,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.013363093137741089,
      "kl": 1.0715976715087892,
      "learning_rate": 2.0151111111111114e-06,
      "loss": 0.0011,
      "num_tokens": 9942003.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7970
    },
    {
      "completion_length": 13.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.9,
      "completions/max_terminated_length": 17.9,
      "completions/mean_length": 13.0,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 10.3,
      "completions/min_terminated_length": 10.3,
      "epoch": 0.5487553293907304,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004301813896745443,
      "kl": 1.037192702293396,
      "learning_rate": 2.0106666666666667e-06,
      "loss": 0.001,
      "num_tokens": 9953631.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7980
    },
    {
      "completion_length": 14.875,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 25.3,
      "completions/max_terminated_length": 25.3,
      "completions/mean_length": 14.875,
      "completions/mean_terminated_length": 14.875,
      "completions/min_length": 10.2,
      "completions/min_terminated_length": 10.2,
      "epoch": 0.5494429927107688,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.005624216981232166,
      "kl": 1.1227632999420165,
      "learning_rate": 2.0062222222222224e-06,
      "loss": 0.0011,
      "num_tokens": 9963874.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 7990
    },
    {
      "completion_length": 12.525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.6,
      "completions/max_terminated_length": 16.6,
      "completions/mean_length": 12.525,
      "completions/mean_terminated_length": 12.525,
      "completions/min_length": 9.4,
      "completions/min_terminated_length": 9.4,
      "epoch": 0.5501306560308074,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002584875328466296,
      "kl": 1.117231160402298,
      "learning_rate": 2.001777777777778e-06,
      "loss": 0.0011,
      "num_tokens": 9976807.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8000
    },
    {
      "completion_length": 15.75,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 29.9,
      "completions/max_terminated_length": 29.9,
      "completions/mean_length": 15.75,
      "completions/mean_terminated_length": 15.75,
      "completions/min_length": 9.8,
      "completions/min_terminated_length": 9.8,
      "epoch": 0.5508183193508458,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.004169847350567579,
      "kl": 1.0605437874794006,
      "learning_rate": 1.9973333333333337e-06,
      "loss": 0.0011,
      "num_tokens": 9989301.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8010
    },
    {
      "completion_length": 12.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.2,
      "completions/max_terminated_length": 16.2,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.5515059826708844,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005917856469750404,
      "kl": 1.1970151841640473,
      "learning_rate": 1.992888888888889e-06,
      "loss": 0.0012,
      "num_tokens": 10001573.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8020
    },
    {
      "completion_length": 11.725,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.9,
      "completions/max_terminated_length": 14.9,
      "completions/mean_length": 11.725,
      "completions/mean_terminated_length": 11.725,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.5521936459909228,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 3.603567600250244,
      "kl": 1.2946309864521026,
      "learning_rate": 1.9884444444444447e-06,
      "loss": 0.0013,
      "num_tokens": 10014162.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8030
    },
    {
      "completion_length": 12.1,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.9,
      "completions/max_terminated_length": 14.9,
      "completions/mean_length": 12.1,
      "completions/mean_terminated_length": 12.1,
      "completions/min_length": 10.1,
      "completions/min_terminated_length": 10.1,
      "epoch": 0.5528813093109614,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0021072819363325834,
      "kl": 1.2057231783866882,
      "learning_rate": 1.984e-06,
      "loss": 0.0012,
      "num_tokens": 10026470.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8040
    },
    {
      "completion_length": 12.2,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.5,
      "completions/max_terminated_length": 17.5,
      "completions/mean_length": 12.2,
      "completions/mean_terminated_length": 12.2,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.5535689726309999,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.18881727755069733,
      "kl": 1.5855478703975678,
      "learning_rate": 1.9795555555555557e-06,
      "loss": 0.0016,
      "num_tokens": 10038874.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8050
    },
    {
      "completion_length": 11.95,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.5,
      "completions/max_terminated_length": 15.5,
      "completions/mean_length": 11.95,
      "completions/mean_terminated_length": 11.95,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.5542566359510384,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004419395700097084,
      "kl": 1.1470963537693024,
      "learning_rate": 1.9751111111111114e-06,
      "loss": 0.0011,
      "num_tokens": 10049296.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8060
    },
    {
      "completion_length": 14.3,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 20.6,
      "completions/max_terminated_length": 20.6,
      "completions/mean_length": 14.3,
      "completions/mean_terminated_length": 14.3,
      "completions/min_length": 10.2,
      "completions/min_terminated_length": 10.2,
      "epoch": 0.5549442992710769,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002880280837416649,
      "kl": 0.9816267251968384,
      "learning_rate": 1.9706666666666666e-06,
      "loss": 0.001,
      "num_tokens": 10062172.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8070
    },
    {
      "completion_length": 12.925,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 20.0,
      "completions/max_terminated_length": 20.0,
      "completions/mean_length": 12.925,
      "completions/mean_terminated_length": 12.925,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.5556319625911154,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.005350610241293907,
      "kl": 1.1550312280654906,
      "learning_rate": 1.9662222222222223e-06,
      "loss": 0.0012,
      "num_tokens": 10076217.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8080
    },
    {
      "completion_length": 13.5,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.9,
      "completions/max_terminated_length": 17.9,
      "completions/mean_length": 13.5,
      "completions/mean_terminated_length": 13.5,
      "completions/min_length": 10.3,
      "completions/min_terminated_length": 10.3,
      "epoch": 0.5563196259111539,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005526478867977858,
      "kl": 1.0993483304977416,
      "learning_rate": 1.961777777777778e-06,
      "loss": 0.0011,
      "num_tokens": 10089445.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8090
    },
    {
      "completion_length": 13.1,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 18.3,
      "completions/max_terminated_length": 18.3,
      "completions/mean_length": 13.1,
      "completions/mean_terminated_length": 13.1,
      "completions/min_length": 9.4,
      "completions/min_terminated_length": 9.4,
      "epoch": 0.5570072892311924,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004046977963298559,
      "kl": 1.1231428921222686,
      "learning_rate": 1.9573333333333337e-06,
      "loss": 0.0011,
      "num_tokens": 10103149.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8100
    },
    {
      "completion_length": 15.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 24.5,
      "completions/max_terminated_length": 24.5,
      "completions/mean_length": 15.0,
      "completions/mean_terminated_length": 15.0,
      "completions/min_length": 10.2,
      "completions/min_terminated_length": 10.2,
      "epoch": 0.5576949525512309,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.0036207973025739193,
      "kl": 1.399965763092041,
      "learning_rate": 1.952888888888889e-06,
      "loss": 0.0014,
      "num_tokens": 10115033.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8110
    },
    {
      "completion_length": 13.25,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 18.4,
      "completions/max_terminated_length": 18.4,
      "completions/mean_length": 13.25,
      "completions/mean_terminated_length": 13.25,
      "completions/min_length": 10.1,
      "completions/min_terminated_length": 10.1,
      "epoch": 0.5583826158712695,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.020446887239813805,
      "kl": 1.1484535217285157,
      "learning_rate": 1.9484444444444447e-06,
      "loss": 0.0011,
      "num_tokens": 10129707.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8120
    },
    {
      "completion_length": 14.575,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 26.9,
      "completions/max_terminated_length": 26.9,
      "completions/mean_length": 14.575,
      "completions/mean_terminated_length": 14.575,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.5590702791913079,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.018497074022889137,
      "kl": 1.132157129049301,
      "learning_rate": 1.944e-06,
      "loss": 0.0011,
      "num_tokens": 10142250.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8130
    },
    {
      "completion_length": 11.85,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.4,
      "completions/max_terminated_length": 15.4,
      "completions/mean_length": 11.85,
      "completions/mean_terminated_length": 11.85,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.5597579425113465,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.003500531194731593,
      "kl": 1.4015939116477967,
      "learning_rate": 1.9395555555555556e-06,
      "loss": 0.0014,
      "num_tokens": 10153132.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8140
    },
    {
      "completion_length": 15.225,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 27.5,
      "completions/max_terminated_length": 27.5,
      "completions/mean_length": 15.225,
      "completions/mean_terminated_length": 15.225,
      "completions/min_length": 9.9,
      "completions/min_terminated_length": 9.9,
      "epoch": 0.5604456058313849,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.002279973356053233,
      "kl": 1.0312190353870392,
      "learning_rate": 1.9351111111111113e-06,
      "loss": 0.001,
      "num_tokens": 10165629.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8150
    },
    {
      "completion_length": 20.3,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 43.8,
      "completions/max_terminated_length": 43.8,
      "completions/mean_length": 20.3,
      "completions/mean_terminated_length": 20.3,
      "completions/min_length": 10.5,
      "completions/min_terminated_length": 10.5,
      "epoch": 0.5611332691514235,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.009704441763460636,
      "kl": 0.9771228730678558,
      "learning_rate": 1.9306666666666666e-06,
      "loss": 0.001,
      "num_tokens": 10179037.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8160
    },
    {
      "completion_length": 13.725,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 22.1,
      "completions/max_terminated_length": 22.1,
      "completions/mean_length": 13.725,
      "completions/mean_terminated_length": 13.725,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.5618209324714619,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005785486660897732,
      "kl": 1.0721399009227752,
      "learning_rate": 1.9262222222222223e-06,
      "loss": 0.0011,
      "num_tokens": 10190790.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8170
    },
    {
      "completion_length": 13.3,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 18.3,
      "completions/max_terminated_length": 18.3,
      "completions/mean_length": 13.3,
      "completions/mean_terminated_length": 13.3,
      "completions/min_length": 10.6,
      "completions/min_terminated_length": 10.6,
      "epoch": 0.5625085957915005,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.006073605734854937,
      "kl": 1.0811134040355683,
      "learning_rate": 1.921777777777778e-06,
      "loss": 0.0011,
      "num_tokens": 10201646.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8180
    },
    {
      "completion_length": 21.425,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 53.7,
      "completions/max_terminated_length": 53.7,
      "completions/mean_length": 21.425,
      "completions/mean_terminated_length": 21.425,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.563196259111539,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.008377310819923878,
      "kl": 1.2893824517726897,
      "learning_rate": 1.9173333333333337e-06,
      "loss": 0.0013,
      "num_tokens": 10213359.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8190
    },
    {
      "completion_length": 11.5,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.3,
      "completions/max_terminated_length": 15.3,
      "completions/mean_length": 11.5,
      "completions/mean_terminated_length": 11.5,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.5638839224315775,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0031006948556751013,
      "kl": 1.283377367258072,
      "learning_rate": 1.912888888888889e-06,
      "loss": 0.0013,
      "num_tokens": 10225839.0,
      "reward": 5.9625,
      "reward_std": 0.075,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.05,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8200
    },
    {
      "completion_length": 11.7,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.8,
      "completions/max_terminated_length": 14.8,
      "completions/mean_length": 11.7,
      "completions/mean_terminated_length": 11.7,
      "completions/min_length": 9.2,
      "completions/min_terminated_length": 9.2,
      "epoch": 0.564571585751616,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019166450947523117,
      "kl": 1.1040916562080383,
      "learning_rate": 1.9084444444444446e-06,
      "loss": 0.0011,
      "num_tokens": 10238179.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8210
    },
    {
      "completion_length": 11.3,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.1,
      "completions/max_terminated_length": 14.1,
      "completions/mean_length": 11.3,
      "completions/mean_terminated_length": 11.3,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.5652592490716545,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008414049632847309,
      "kl": 1.11358762383461,
      "learning_rate": 1.9040000000000003e-06,
      "loss": 0.0011,
      "num_tokens": 10249411.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8220
    },
    {
      "completion_length": 11.825,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.1,
      "completions/max_terminated_length": 15.1,
      "completions/mean_length": 11.825,
      "completions/mean_terminated_length": 11.825,
      "completions/min_length": 9.5,
      "completions/min_terminated_length": 9.5,
      "epoch": 0.565946912391693,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.011217906139791012,
      "kl": 1.1142095625400543,
      "learning_rate": 1.8995555555555556e-06,
      "loss": 0.0011,
      "num_tokens": 10263452.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8230
    },
    {
      "completion_length": 12.35,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.7,
      "completions/max_terminated_length": 16.7,
      "completions/mean_length": 12.35,
      "completions/mean_terminated_length": 12.35,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.5666345757117315,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003973840270191431,
      "kl": 1.2286709368228912,
      "learning_rate": 1.8951111111111113e-06,
      "loss": 0.0012,
      "num_tokens": 10275398.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8240
    },
    {
      "completion_length": 11.375,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.0,
      "completions/max_terminated_length": 14.0,
      "completions/mean_length": 11.375,
      "completions/mean_terminated_length": 11.375,
      "completions/min_length": 8.6,
      "completions/min_terminated_length": 8.6,
      "epoch": 0.56732223903177,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0032951238099485636,
      "kl": 0.9890437185764313,
      "learning_rate": 1.8906666666666668e-06,
      "loss": 0.001,
      "num_tokens": 10287797.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8250
    },
    {
      "completion_length": 11.85,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.9,
      "completions/max_terminated_length": 14.9,
      "completions/mean_length": 11.85,
      "completions/mean_terminated_length": 11.85,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.5680099023518086,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008584411814808846,
      "kl": 1.2706584572792052,
      "learning_rate": 1.8862222222222222e-06,
      "loss": 0.0013,
      "num_tokens": 10300355.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8260
    },
    {
      "completion_length": 12.425,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.3,
      "completions/max_terminated_length": 17.3,
      "completions/mean_length": 12.425,
      "completions/mean_terminated_length": 12.425,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.568697565671847,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.027631660923361778,
      "kl": 1.2103368103504182,
      "learning_rate": 1.881777777777778e-06,
      "loss": 0.0012,
      "num_tokens": 10311192.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8270
    },
    {
      "completion_length": 12.1,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.9,
      "completions/max_terminated_length": 16.9,
      "completions/mean_length": 12.1,
      "completions/mean_terminated_length": 12.1,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.5693852289918856,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002199193462729454,
      "kl": 1.0713283360004424,
      "learning_rate": 1.8773333333333334e-06,
      "loss": 0.0011,
      "num_tokens": 10324664.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8280
    },
    {
      "completion_length": 11.55,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.9,
      "completions/max_terminated_length": 14.9,
      "completions/mean_length": 11.55,
      "completions/mean_terminated_length": 11.55,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.570072892311924,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.009694810956716537,
      "kl": 1.261211758852005,
      "learning_rate": 1.8728888888888891e-06,
      "loss": 0.0013,
      "num_tokens": 10334646.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8290
    },
    {
      "completion_length": 12.525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.4,
      "completions/max_terminated_length": 17.4,
      "completions/mean_length": 12.525,
      "completions/mean_terminated_length": 12.525,
      "completions/min_length": 9.6,
      "completions/min_terminated_length": 9.6,
      "epoch": 0.5707605556319626,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0036631503608077765,
      "kl": 1.2292134821414948,
      "learning_rate": 1.8684444444444446e-06,
      "loss": 0.0012,
      "num_tokens": 10347247.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8300
    },
    {
      "completion_length": 12.325,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.6,
      "completions/max_terminated_length": 15.6,
      "completions/mean_length": 12.325,
      "completions/mean_terminated_length": 12.325,
      "completions/min_length": 9.6,
      "completions/min_terminated_length": 9.6,
      "epoch": 0.5714482189520012,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003963626455515623,
      "kl": 4.332015436887741,
      "learning_rate": 1.8640000000000003e-06,
      "loss": 0.0043,
      "num_tokens": 10360732.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8310
    },
    {
      "completion_length": 11.9,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.3,
      "completions/max_terminated_length": 15.3,
      "completions/mean_length": 11.9,
      "completions/mean_terminated_length": 11.9,
      "completions/min_length": 9.4,
      "completions/min_terminated_length": 9.4,
      "epoch": 0.5721358822720396,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002724467311054468,
      "kl": 1.1339951932430268,
      "learning_rate": 1.8595555555555558e-06,
      "loss": 0.0011,
      "num_tokens": 10373472.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8320
    },
    {
      "completion_length": 11.85,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.2,
      "completions/max_terminated_length": 15.2,
      "completions/mean_length": 11.85,
      "completions/mean_terminated_length": 11.85,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.5728235455920782,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0034971812274307013,
      "kl": 1.0714882016181946,
      "learning_rate": 1.8551111111111112e-06,
      "loss": 0.0011,
      "num_tokens": 10386234.0,
      "reward": 5.9625,
      "reward_std": 0.075,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.05,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8330
    },
    {
      "completion_length": 24.55,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 63.6,
      "completions/max_terminated_length": 63.6,
      "completions/mean_length": 24.55,
      "completions/mean_terminated_length": 24.55,
      "completions/min_length": 9.9,
      "completions/min_terminated_length": 9.9,
      "epoch": 0.5735112089121166,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.0032173949293792248,
      "kl": 1.1109222888946533,
      "learning_rate": 1.8506666666666667e-06,
      "loss": 0.0011,
      "num_tokens": 10399544.0,
      "reward": 5.925,
      "reward_std": 0.15,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.05,
      "rewards/match_format_approximately/mean": 0.975,
      "rewards/match_format_approximately/std": 0.05,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8340
    },
    {
      "completion_length": 11.675,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.2,
      "completions/max_terminated_length": 14.2,
      "completions/mean_length": 11.675,
      "completions/mean_terminated_length": 11.675,
      "completions/min_length": 9.5,
      "completions/min_terminated_length": 9.5,
      "epoch": 0.5741988722321552,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018772322218865156,
      "kl": 1.2787903010845185,
      "learning_rate": 1.8462222222222222e-06,
      "loss": 0.0013,
      "num_tokens": 10411335.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8350
    },
    {
      "completion_length": 11.7,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.7,
      "completions/max_terminated_length": 14.7,
      "completions/mean_length": 11.7,
      "completions/mean_terminated_length": 11.7,
      "completions/min_length": 9.4,
      "completions/min_terminated_length": 9.4,
      "epoch": 0.5748865355521936,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0070866490714251995,
      "kl": 1.1854866743087769,
      "learning_rate": 1.8417777777777779e-06,
      "loss": 0.0012,
      "num_tokens": 10424947.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8360
    },
    {
      "completion_length": 11.6,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.7,
      "completions/max_terminated_length": 13.7,
      "completions/mean_length": 11.6,
      "completions/mean_terminated_length": 11.6,
      "completions/min_length": 9.9,
      "completions/min_terminated_length": 9.9,
      "epoch": 0.5755741988722322,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002047724789008498,
      "kl": 1.1531950116157532,
      "learning_rate": 1.8373333333333334e-06,
      "loss": 0.0012,
      "num_tokens": 10439447.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8370
    },
    {
      "completion_length": 14.25,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 21.9,
      "completions/max_terminated_length": 21.9,
      "completions/mean_length": 14.25,
      "completions/mean_terminated_length": 14.25,
      "completions/min_length": 9.6,
      "completions/min_terminated_length": 9.6,
      "epoch": 0.5762618621922707,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.005259161815047264,
      "kl": 1.24915811419487,
      "learning_rate": 1.832888888888889e-06,
      "loss": 0.0012,
      "num_tokens": 10452997.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8380
    },
    {
      "completion_length": 10.725,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.2,
      "completions/max_terminated_length": 13.2,
      "completions/mean_length": 10.725,
      "completions/mean_terminated_length": 10.725,
      "completions/min_length": 8.7,
      "completions/min_terminated_length": 8.7,
      "epoch": 0.5769495255123092,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0033741919323801994,
      "kl": 1.3191501200199127,
      "learning_rate": 1.8284444444444445e-06,
      "loss": 0.0013,
      "num_tokens": 10465878.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8390
    },
    {
      "completion_length": 11.3,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.3,
      "completions/max_terminated_length": 14.3,
      "completions/mean_length": 11.3,
      "completions/mean_terminated_length": 11.3,
      "completions/min_length": 8.8,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.5776371888323477,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003714599646627903,
      "kl": 1.2673472166061401,
      "learning_rate": 1.8240000000000002e-06,
      "loss": 0.0013,
      "num_tokens": 10477354.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8400
    },
    {
      "completion_length": 11.8,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.7,
      "completions/max_terminated_length": 14.7,
      "completions/mean_length": 11.8,
      "completions/mean_terminated_length": 11.8,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.5783248521523862,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022408729419112206,
      "kl": 1.218839818239212,
      "learning_rate": 1.8195555555555557e-06,
      "loss": 0.0012,
      "num_tokens": 10489222.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8410
    },
    {
      "completion_length": 11.675,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.1,
      "completions/max_terminated_length": 15.1,
      "completions/mean_length": 11.675,
      "completions/mean_terminated_length": 11.675,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.5790125154724247,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0019483575597405434,
      "kl": 1.03574578166008,
      "learning_rate": 1.8151111111111114e-06,
      "loss": 0.001,
      "num_tokens": 10500381.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8420
    },
    {
      "completion_length": 11.975,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.2,
      "completions/max_terminated_length": 15.2,
      "completions/mean_length": 11.975,
      "completions/mean_terminated_length": 11.975,
      "completions/min_length": 9.6,
      "completions/min_terminated_length": 9.6,
      "epoch": 0.5797001787924632,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004584670998156071,
      "kl": 1.1955373942852021,
      "learning_rate": 1.8106666666666667e-06,
      "loss": 0.0012,
      "num_tokens": 10512416.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8430
    },
    {
      "completion_length": 18.525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 43.0,
      "completions/max_terminated_length": 43.0,
      "completions/mean_length": 18.525,
      "completions/mean_terminated_length": 18.525,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.5803878421125017,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0029421942308545113,
      "kl": 1.170852828025818,
      "learning_rate": 1.8062222222222222e-06,
      "loss": 0.0012,
      "num_tokens": 10526149.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8440
    },
    {
      "completion_length": 11.7,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.0,
      "completions/max_terminated_length": 14.0,
      "completions/mean_length": 11.7,
      "completions/mean_terminated_length": 11.7,
      "completions/min_length": 9.5,
      "completions/min_terminated_length": 9.5,
      "epoch": 0.5810755054325403,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004435285460203886,
      "kl": 1.3717931151390075,
      "learning_rate": 1.8017777777777779e-06,
      "loss": 0.0014,
      "num_tokens": 10538289.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8450
    },
    {
      "completion_length": 12.5,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.5,
      "completions/max_terminated_length": 15.5,
      "completions/mean_length": 12.5,
      "completions/mean_terminated_length": 12.5,
      "completions/min_length": 9.4,
      "completions/min_terminated_length": 9.4,
      "epoch": 0.5817631687525787,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0038708082865923643,
      "kl": 1.1633994162082673,
      "learning_rate": 1.7973333333333333e-06,
      "loss": 0.0012,
      "num_tokens": 10550281.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8460
    },
    {
      "completion_length": 11.7,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.2,
      "completions/max_terminated_length": 14.2,
      "completions/mean_length": 11.7,
      "completions/mean_terminated_length": 11.7,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.5824508320726173,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018100917804986238,
      "kl": 1.089043253660202,
      "learning_rate": 1.792888888888889e-06,
      "loss": 0.0011,
      "num_tokens": 10563557.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8470
    },
    {
      "completion_length": 12.225,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.9,
      "completions/max_terminated_length": 15.9,
      "completions/mean_length": 12.225,
      "completions/mean_terminated_length": 12.225,
      "completions/min_length": 9.4,
      "completions/min_terminated_length": 9.4,
      "epoch": 0.5831384953926557,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007320227101445198,
      "kl": 1.1034001350402831,
      "learning_rate": 1.7884444444444445e-06,
      "loss": 0.0011,
      "num_tokens": 10574910.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8480
    },
    {
      "completion_length": 12.45,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.6,
      "completions/max_terminated_length": 15.6,
      "completions/mean_length": 12.45,
      "completions/mean_terminated_length": 12.45,
      "completions/min_length": 10.3,
      "completions/min_terminated_length": 10.3,
      "epoch": 0.5838261587126943,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0036800052039325237,
      "kl": 1.1248241066932678,
      "learning_rate": 1.7840000000000002e-06,
      "loss": 0.0011,
      "num_tokens": 10588092.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8490
    },
    {
      "completion_length": 12.675,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.6,
      "completions/max_terminated_length": 16.6,
      "completions/mean_length": 12.675,
      "completions/mean_terminated_length": 12.675,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.5845138220327327,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.01451704278588295,
      "kl": 1.1269919991493225,
      "learning_rate": 1.7795555555555557e-06,
      "loss": 0.0011,
      "num_tokens": 10599079.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8500
    },
    {
      "completion_length": 12.25,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.0,
      "completions/max_terminated_length": 16.0,
      "completions/mean_length": 12.25,
      "completions/mean_terminated_length": 12.25,
      "completions/min_length": 9.5,
      "completions/min_terminated_length": 9.5,
      "epoch": 0.5852014853527713,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003538082353770733,
      "kl": 1.2523035109043121,
      "learning_rate": 1.7751111111111114e-06,
      "loss": 0.0013,
      "num_tokens": 10610125.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8510
    },
    {
      "completion_length": 12.7,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.3,
      "completions/max_terminated_length": 17.3,
      "completions/mean_length": 12.7,
      "completions/mean_terminated_length": 12.7,
      "completions/min_length": 9.4,
      "completions/min_terminated_length": 9.4,
      "epoch": 0.5858891486728098,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004502912051975727,
      "kl": 1.0592267155647277,
      "learning_rate": 1.7706666666666669e-06,
      "loss": 0.0011,
      "num_tokens": 10621961.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8520
    },
    {
      "completion_length": 11.9,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.6,
      "completions/max_terminated_length": 14.6,
      "completions/mean_length": 11.9,
      "completions/mean_terminated_length": 11.9,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.5865768119928483,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00452207587659359,
      "kl": 1.2047041416168214,
      "learning_rate": 1.7662222222222225e-06,
      "loss": 0.0012,
      "num_tokens": 10633465.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8530
    },
    {
      "completion_length": 13.925,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 18.4,
      "completions/max_terminated_length": 18.4,
      "completions/mean_length": 13.925,
      "completions/mean_terminated_length": 13.925,
      "completions/min_length": 10.2,
      "completions/min_terminated_length": 10.2,
      "epoch": 0.5872644753128868,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.057433757930994034,
      "kl": 1.1581880033016205,
      "learning_rate": 1.7617777777777778e-06,
      "loss": 0.0012,
      "num_tokens": 10644906.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8540
    },
    {
      "completion_length": 12.625,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.8,
      "completions/max_terminated_length": 15.8,
      "completions/mean_length": 12.625,
      "completions/mean_terminated_length": 12.625,
      "completions/min_length": 9.8,
      "completions/min_terminated_length": 9.8,
      "epoch": 0.5879521386329253,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.005435035564005375,
      "kl": 1.1151172339916229,
      "learning_rate": 1.7573333333333333e-06,
      "loss": 0.0011,
      "num_tokens": 10657511.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8550
    },
    {
      "completion_length": 12.9,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 18.4,
      "completions/max_terminated_length": 18.4,
      "completions/mean_length": 12.9,
      "completions/mean_terminated_length": 12.9,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.5886398019529638,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004669901914894581,
      "kl": 1.1759621858596803,
      "learning_rate": 1.752888888888889e-06,
      "loss": 0.0012,
      "num_tokens": 10670195.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8560
    },
    {
      "completion_length": 11.925,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.2,
      "completions/max_terminated_length": 16.2,
      "completions/mean_length": 11.925,
      "completions/mean_terminated_length": 11.925,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.5893274652730024,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003652728395536542,
      "kl": 1.1641576826572417,
      "learning_rate": 1.7484444444444445e-06,
      "loss": 0.0012,
      "num_tokens": 10682340.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8570
    },
    {
      "completion_length": 12.025,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.0,
      "completions/max_terminated_length": 15.0,
      "completions/mean_length": 12.025,
      "completions/mean_terminated_length": 12.025,
      "completions/min_length": 9.2,
      "completions/min_terminated_length": 9.2,
      "epoch": 0.5900151285930408,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0023117121309041977,
      "kl": 1.1836768448352815,
      "learning_rate": 1.7440000000000002e-06,
      "loss": 0.0012,
      "num_tokens": 10694753.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8580
    },
    {
      "completion_length": 13.1,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.9,
      "completions/max_terminated_length": 16.9,
      "completions/mean_length": 13.1,
      "completions/mean_terminated_length": 13.1,
      "completions/min_length": 10.3,
      "completions/min_terminated_length": 10.3,
      "epoch": 0.5907027919130794,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0037394952960312366,
      "kl": 1.2706849694252014,
      "learning_rate": 1.7395555555555556e-06,
      "loss": 0.0013,
      "num_tokens": 10707989.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8590
    },
    {
      "completion_length": 12.6,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.8,
      "completions/max_terminated_length": 16.8,
      "completions/mean_length": 12.6,
      "completions/mean_terminated_length": 12.6,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.5913904552331178,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003737551858648658,
      "kl": 1.1382164716720582,
      "learning_rate": 1.7351111111111113e-06,
      "loss": 0.0011,
      "num_tokens": 10721337.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8600
    },
    {
      "completion_length": 11.475,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.0,
      "completions/max_terminated_length": 14.0,
      "completions/mean_length": 11.475,
      "completions/mean_terminated_length": 11.475,
      "completions/min_length": 9.6,
      "completions/min_terminated_length": 9.6,
      "epoch": 0.5920781185531564,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0045438166707754135,
      "kl": 1.2445359587669373,
      "learning_rate": 1.7306666666666668e-06,
      "loss": 0.0012,
      "num_tokens": 10732732.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8610
    },
    {
      "completion_length": 11.05,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.4,
      "completions/max_terminated_length": 13.4,
      "completions/mean_length": 11.05,
      "completions/mean_terminated_length": 11.05,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.5927657818731948,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.007625124417245388,
      "kl": 1.1852139592170716,
      "learning_rate": 1.7262222222222225e-06,
      "loss": 0.0012,
      "num_tokens": 10744194.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8620
    },
    {
      "completion_length": 11.9,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.8,
      "completions/max_terminated_length": 14.8,
      "completions/mean_length": 11.9,
      "completions/mean_terminated_length": 11.9,
      "completions/min_length": 9.6,
      "completions/min_terminated_length": 9.6,
      "epoch": 0.5934534451932334,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.006672825198620558,
      "kl": 1.1356277108192443,
      "learning_rate": 1.721777777777778e-06,
      "loss": 0.0011,
      "num_tokens": 10756170.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8630
    },
    {
      "completion_length": 12.65,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.0,
      "completions/max_terminated_length": 16.0,
      "completions/mean_length": 12.65,
      "completions/mean_terminated_length": 12.65,
      "completions/min_length": 9.8,
      "completions/min_terminated_length": 9.8,
      "epoch": 0.594141108513272,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.015049074776470661,
      "kl": 1.2014117240905762,
      "learning_rate": 1.7173333333333333e-06,
      "loss": 0.0012,
      "num_tokens": 10769332.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8640
    },
    {
      "completion_length": 12.9,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.0,
      "completions/max_terminated_length": 17.0,
      "completions/mean_length": 12.9,
      "completions/mean_terminated_length": 12.9,
      "completions/min_length": 10.1,
      "completions/min_terminated_length": 10.1,
      "epoch": 0.5948287718333104,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0029658586718142033,
      "kl": 1.1564993917942048,
      "learning_rate": 1.712888888888889e-06,
      "loss": 0.0012,
      "num_tokens": 10781564.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8650
    },
    {
      "completion_length": 12.925,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 18.1,
      "completions/max_terminated_length": 18.1,
      "completions/mean_length": 12.925,
      "completions/mean_terminated_length": 12.925,
      "completions/min_length": 9.7,
      "completions/min_terminated_length": 9.7,
      "epoch": 0.595516435153349,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.003762275679036975,
      "kl": 1.100640344619751,
      "learning_rate": 1.7084444444444444e-06,
      "loss": 0.0011,
      "num_tokens": 10792437.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8660
    },
    {
      "completion_length": 12.175,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.4,
      "completions/max_terminated_length": 16.4,
      "completions/mean_length": 12.175,
      "completions/mean_terminated_length": 12.175,
      "completions/min_length": 8.5,
      "completions/min_terminated_length": 8.5,
      "epoch": 0.5962040984733874,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0047178128734230995,
      "kl": 1.2938535451889037,
      "learning_rate": 1.7040000000000001e-06,
      "loss": 0.0013,
      "num_tokens": 10805384.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8670
    },
    {
      "completion_length": 12.425,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.4,
      "completions/max_terminated_length": 15.4,
      "completions/mean_length": 12.425,
      "completions/mean_terminated_length": 12.425,
      "completions/min_length": 9.9,
      "completions/min_terminated_length": 9.9,
      "epoch": 0.596891761793426,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0027440576814115047,
      "kl": 1.2109251201152802,
      "learning_rate": 1.6995555555555556e-06,
      "loss": 0.0012,
      "num_tokens": 10818121.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8680
    },
    {
      "completion_length": 13.125,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 18.1,
      "completions/max_terminated_length": 18.1,
      "completions/mean_length": 13.125,
      "completions/mean_terminated_length": 13.125,
      "completions/min_length": 10.3,
      "completions/min_terminated_length": 10.3,
      "epoch": 0.5975794251134644,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004016244318336248,
      "kl": 1.1241649985313416,
      "learning_rate": 1.6951111111111113e-06,
      "loss": 0.0011,
      "num_tokens": 10831398.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8690
    },
    {
      "completion_length": 12.275,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.8,
      "completions/max_terminated_length": 16.8,
      "completions/mean_length": 12.275,
      "completions/mean_terminated_length": 12.275,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.598267088433503,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004033960402011871,
      "kl": 1.2029431879520416,
      "learning_rate": 1.6906666666666668e-06,
      "loss": 0.0012,
      "num_tokens": 10843609.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8700
    },
    {
      "completion_length": 16.525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 29.7,
      "completions/max_terminated_length": 29.7,
      "completions/mean_length": 16.525,
      "completions/mean_terminated_length": 16.525,
      "completions/min_length": 10.7,
      "completions/min_terminated_length": 10.7,
      "epoch": 0.5989547517535415,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.002579750493168831,
      "kl": 1.306799578666687,
      "learning_rate": 1.6862222222222225e-06,
      "loss": 0.0013,
      "num_tokens": 10855890.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8710
    },
    {
      "completion_length": 17.6,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 38.9,
      "completions/max_terminated_length": 38.9,
      "completions/mean_length": 17.6,
      "completions/mean_terminated_length": 17.6,
      "completions/min_length": 9.3,
      "completions/min_terminated_length": 9.3,
      "epoch": 0.59964241507358,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.004552737809717655,
      "kl": 1.1339510440826417,
      "learning_rate": 1.681777777777778e-06,
      "loss": 0.0011,
      "num_tokens": 10868374.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8720
    },
    {
      "completion_length": 13.025,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.8,
      "completions/max_terminated_length": 16.8,
      "completions/mean_length": 13.025,
      "completions/mean_terminated_length": 13.025,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.6003300783936185,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004149656742811203,
      "kl": 1.2595533192157746,
      "learning_rate": 1.6773333333333336e-06,
      "loss": 0.0013,
      "num_tokens": 10881307.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8730
    },
    {
      "completion_length": 11.925,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.3,
      "completions/max_terminated_length": 15.3,
      "completions/mean_length": 11.925,
      "completions/mean_terminated_length": 11.925,
      "completions/min_length": 9.4,
      "completions/min_terminated_length": 9.4,
      "epoch": 0.601017741713657,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0024880890268832445,
      "kl": 1.2572630107402802,
      "learning_rate": 1.672888888888889e-06,
      "loss": 0.0013,
      "num_tokens": 10894100.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8740
    },
    {
      "completion_length": 13.3,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 17.6,
      "completions/max_terminated_length": 17.6,
      "completions/mean_length": 13.3,
      "completions/mean_terminated_length": 13.3,
      "completions/min_length": 9.8,
      "completions/min_terminated_length": 9.8,
      "epoch": 0.6017054050336955,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004873007535934448,
      "kl": 1.2138256490230561,
      "learning_rate": 1.6684444444444444e-06,
      "loss": 0.0012,
      "num_tokens": 10907060.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8750
    },
    {
      "completion_length": 18.75,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 38.8,
      "completions/max_terminated_length": 38.8,
      "completions/mean_length": 18.75,
      "completions/mean_terminated_length": 18.75,
      "completions/min_length": 10.4,
      "completions/min_terminated_length": 10.4,
      "epoch": 0.602393068353734,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.0031885248608887196,
      "kl": 1.0743233919143678,
      "learning_rate": 1.664e-06,
      "loss": 0.0011,
      "num_tokens": 10918878.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8760
    },
    {
      "completion_length": 11.8,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.7,
      "completions/max_terminated_length": 14.7,
      "completions/mean_length": 11.8,
      "completions/mean_terminated_length": 11.8,
      "completions/min_length": 8.9,
      "completions/min_terminated_length": 8.9,
      "epoch": 0.6030807316737725,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.013680394738912582,
      "kl": 1.296369630098343,
      "learning_rate": 1.6595555555555556e-06,
      "loss": 0.0013,
      "num_tokens": 10930078.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8770
    },
    {
      "completion_length": 12.85,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.6,
      "completions/max_terminated_length": 16.6,
      "completions/mean_length": 12.85,
      "completions/mean_terminated_length": 12.85,
      "completions/min_length": 10.1,
      "completions/min_terminated_length": 10.1,
      "epoch": 0.6037683949938111,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003447858849540353,
      "kl": 1.0455607414245605,
      "learning_rate": 1.6551111111111112e-06,
      "loss": 0.001,
      "num_tokens": 10941140.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8780
    },
    {
      "completion_length": 18.3,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 39.6,
      "completions/max_terminated_length": 39.6,
      "completions/mean_length": 18.3,
      "completions/mean_terminated_length": 18.3,
      "completions/min_length": 10.1,
      "completions/min_terminated_length": 10.1,
      "epoch": 0.6044560583138495,
      "frac_reward_zero_std": 0.8,
      "grad_norm": 0.002344973385334015,
      "kl": 1.1524518728256226,
      "learning_rate": 1.6506666666666667e-06,
      "loss": 0.0012,
      "num_tokens": 10954176.0,
      "reward": 5.925,
      "reward_std": 0.15,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.475,
      "rewards/check_response_quality/std": 0.05,
      "rewards/match_format_approximately/mean": 0.975,
      "rewards/match_format_approximately/std": 0.05,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8790
    },
    {
      "completion_length": 11.375,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.8,
      "completions/max_terminated_length": 15.8,
      "completions/mean_length": 11.375,
      "completions/mean_terminated_length": 11.375,
      "completions/min_length": 9.1,
      "completions/min_terminated_length": 9.1,
      "epoch": 0.6051437216338881,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.004940181504935026,
      "kl": 1.3828946709632874,
      "learning_rate": 1.6462222222222224e-06,
      "loss": 0.0014,
      "num_tokens": 10966583.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8800
    },
    {
      "completion_length": 17.85,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 36.0,
      "completions/max_terminated_length": 36.0,
      "completions/mean_length": 17.85,
      "completions/mean_terminated_length": 17.85,
      "completions/min_length": 10.1,
      "completions/min_terminated_length": 10.1,
      "epoch": 0.6058313849539265,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.006498556584119797,
      "kl": 1.036690926551819,
      "learning_rate": 1.641777777777778e-06,
      "loss": 0.001,
      "num_tokens": 10980065.0,
      "reward": 5.975,
      "reward_std": 0.05,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8810
    },
    {
      "completion_length": 12.225,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.9,
      "completions/max_terminated_length": 15.9,
      "completions/mean_length": 12.225,
      "completions/mean_terminated_length": 12.225,
      "completions/min_length": 9.9,
      "completions/min_terminated_length": 9.9,
      "epoch": 0.6065190482739651,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0031990488059818745,
      "kl": 1.0018371641635895,
      "learning_rate": 1.6373333333333336e-06,
      "loss": 0.001,
      "num_tokens": 10992706.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8820
    },
    {
      "completion_length": 12.525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 16.1,
      "completions/max_terminated_length": 16.1,
      "completions/mean_length": 12.525,
      "completions/mean_terminated_length": 12.525,
      "completions/min_length": 9.8,
      "completions/min_terminated_length": 9.8,
      "epoch": 0.6072067115940035,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0022266560699790716,
      "kl": 1.026029396057129,
      "learning_rate": 1.632888888888889e-06,
      "loss": 0.001,
      "num_tokens": 11005827.0,
      "reward": 6.0,
      "reward_std": 0.0,
      "rewards/check_coherence/mean": 1.5,
      "rewards/check_coherence/std": 0.0,
      "rewards/check_response_quality/mean": 2.5,
      "rewards/check_response_quality/std": 0.0,
      "rewards/match_format_approximately/mean": 1.0,
      "rewards/match_format_approximately/std": 0.0,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8830
    },
    {
      "completion_length": 23.075,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 57.9,
      "completions/max_terminated_length": 57.9,
      "completions/mean_length": 23.075,
      "completions/mean_terminated_length": 23.075,
      "completions/min_length": 9.7,
      "completions/min_terminated_length": 9.7,
      "epoch": 0.6078943749140421,
      "frac_reward_zero_std": 0.9,
      "grad_norm": 0.004883910529315472,
      "kl": 1.190100622177124,
      "learning_rate": 1.6284444444444448e-06,
      "loss": 0.0012,
      "num_tokens": 11018086.0,
      "reward": 5.95,
      "reward_std": 0.1,
      "rewards/check_coherence/mean": 1.475,
      "rewards/check_coherence/std": 0.05,
      "rewards/check_response_quality/mean": 2.4875,
      "rewards/check_response_quality/std": 0.025,
      "rewards/match_format_approximately/mean": 0.9875,
      "rewards/match_format_approximately/std": 0.025,
      "rewards/match_format_exactly/mean": 1.0,
      "rewards/match_format_exactly/std": 0.0,
      "step": 8840
    }
  ],
  "logging_steps": 10,
  "max_steps": 12500,
  "num_input_tokens_seen": 11018086,
  "num_train_epochs": 1,
  "save_steps": 10,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}