{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.144,
  "eval_steps": 500,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.29166666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3458.0,
      "completions/mean_length": 2937.041748046875,
      "completions/mean_terminated_length": 1580.51611328125,
      "completions/min_length": 291.0,
      "completions/min_terminated_length": 291.0,
      "epoch": 0.002285714285714286,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.13600684702396393,
      "kl": 0.0,
      "learning_rate": 0.0,
      "loss": 0.0728,
      "num_tokens": 298360.0,
      "reward": 0.3645833432674408,
      "reward_std": 0.26059263944625854,
      "rewards/format_reward/mean": 0.3645833432674408,
      "rewards/format_reward/std": 0.4838397204875946,
      "step": 1
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.20833333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3564.0,
      "completions/mean_length": 3029.072998046875,
      "completions/mean_terminated_length": 1747.0,
      "completions/min_length": 416.0,
      "completions/min_terminated_length": 416.0,
      "epoch": 0.004571428571428572,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.09845250844955444,
      "kl": 0.0,
      "learning_rate": 2e-09,
      "loss": 0.0361,
      "num_tokens": 605249.0,
      "reward": 0.3333333432674408,
      "reward_std": 0.2350771129131317,
      "rewards/format_reward/mean": 0.3333333432674408,
      "rewards/format_reward/std": 0.4738790988922119,
      "step": 2
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.125,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 2760.0,
      "completions/mean_length": 2942.73974609375,
      "completions/mean_terminated_length": 1303.9630126953125,
      "completions/min_length": 531.0,
      "completions/min_terminated_length": 531.0,
      "epoch": 0.006857142857142857,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 0.07778529822826385,
      "kl": 0.00041421254475911457,
      "learning_rate": 4e-09,
      "loss": 0.0024,
      "num_tokens": 903910.0,
      "reward": 0.3020833134651184,
      "reward_std": 0.0852636992931366,
      "rewards/format_reward/mean": 0.3020833432674408,
      "rewards/format_reward/std": 0.46157145500183105,
      "step": 3
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.20833333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3436.0,
      "completions/mean_length": 2905.010498046875,
      "completions/mean_terminated_length": 1336.310302734375,
      "completions/min_length": 297.0,
      "completions/min_terminated_length": 297.0,
      "epoch": 0.009142857142857144,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.0983043983578682,
      "kl": 0.00046443939208984375,
      "learning_rate": 5.999999999999999e-09,
      "loss": 0.0618,
      "num_tokens": 1198265.0,
      "reward": 0.3333333432674408,
      "reward_std": 0.2163209319114685,
      "rewards/format_reward/mean": 0.3333333432674408,
      "rewards/format_reward/std": 0.4738790988922119,
      "step": 4
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.9166666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3531.0,
      "completions/mean_length": 2463.61474609375,
      "completions/mean_terminated_length": 1245.8043212890625,
      "completions/min_length": 289.0,
      "completions/min_terminated_length": 289.0,
      "epoch": 0.011428571428571429,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.11297014355659485,
      "kl": 0.0006132125854492188,
      "learning_rate": 8e-09,
      "loss": 0.0303,
      "num_tokens": 1450600.0,
      "reward": 0.4895833432674408,
      "reward_std": 0.19560697674751282,
      "rewards/format_reward/mean": 0.4895833432674408,
      "rewards/format_reward/std": 0.5025155544281006,
      "step": 5
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.25,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3491.0,
      "completions/mean_length": 2930.34375,
      "completions/mean_terminated_length": 1492.300048828125,
      "completions/min_length": 381.0,
      "completions/min_terminated_length": 381.0,
      "epoch": 0.013714285714285714,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.0906430333852768,
      "kl": 0.000526587168375651,
      "learning_rate": 1e-08,
      "loss": 0.0469,
      "num_tokens": 1749601.0,
      "reward": 0.3229166865348816,
      "reward_std": 0.18208828568458557,
      "rewards/format_reward/mean": 0.3229166567325592,
      "rewards/format_reward/std": 0.4700457453727722,
      "step": 6
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.04166666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3539.0,
      "completions/mean_length": 3116.46875,
      "completions/mean_terminated_length": 1788.679931640625,
      "completions/min_length": 536.0,
      "completions/min_terminated_length": 536.0,
      "epoch": 0.016,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.11311256140470505,
      "kl": 0.0005582173665364584,
      "learning_rate": 1.1999999999999998e-08,
      "loss": 0.0692,
      "num_tokens": 2064634.0,
      "reward": 0.2916666865348816,
      "reward_std": 0.21764282882213593,
      "rewards/format_reward/mean": 0.2916666567325592,
      "rewards/format_reward/std": 0.4569156765937805,
      "step": 7
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7083333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3538.0,
      "completions/mean_length": 2796.9375,
      "completions/mean_terminated_length": 1741.121826171875,
      "completions/min_length": 460.0,
      "completions/min_terminated_length": 460.0,
      "epoch": 0.018285714285714287,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.11352546513080597,
      "kl": 0.0006198883056640625,
      "learning_rate": 1.4000000000000001e-08,
      "loss": 0.0384,
      "num_tokens": 2348632.0,
      "reward": 0.4479166865348816,
      "reward_std": 0.24183647334575653,
      "rewards/format_reward/mean": 0.4479166567325592,
      "rewards/format_reward/std": 0.49989035725593567,
      "step": 8
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.45833333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3575.0,
      "completions/mean_length": 2742.21875,
      "completions/mean_terminated_length": 1275.1142578125,
      "completions/min_length": 398.0,
      "completions/min_terminated_length": 398.0,
      "epoch": 0.02057142857142857,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.10035016387701035,
      "kl": 0.000530242919921875,
      "learning_rate": 1.6e-08,
      "loss": 0.017,
      "num_tokens": 2628511.0,
      "reward": 0.3854166865348816,
      "reward_std": 0.14981341361999512,
      "rewards/format_reward/mean": 0.3854166567325592,
      "rewards/format_reward/std": 0.4892484247684479,
      "step": 9
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.33333333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3573.0,
      "completions/mean_length": 2841.84375,
      "completions/mean_terminated_length": 1357.53125,
      "completions/min_length": 455.0,
      "completions/min_terminated_length": 455.0,
      "epoch": 0.022857142857142857,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.10540636628866196,
      "kl": 0.0006135304768880209,
      "learning_rate": 1.8e-08,
      "loss": 0.0248,
      "num_tokens": 2917540.0,
      "reward": 0.3541666865348816,
      "reward_std": 0.17924454808235168,
      "rewards/format_reward/mean": 0.3541666567325592,
      "rewards/format_reward/std": 0.48077061772346497,
      "step": 10
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.25,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3414.0,
      "completions/mean_length": 2882.89599609375,
      "completions/mean_terminated_length": 1340.466796875,
      "completions/min_length": 285.0,
      "completions/min_terminated_length": 285.0,
      "epoch": 0.025142857142857144,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.1847194880247116,
      "kl": 0.0006192525227864584,
      "learning_rate": 2e-08,
      "loss": 0.0474,
      "num_tokens": 3209934.0,
      "reward": 0.3333333432674408,
      "reward_std": 0.1618102639913559,
      "rewards/format_reward/mean": 0.3333333432674408,
      "rewards/format_reward/std": 0.4738790988922119,
      "step": 11
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.25,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3319.0,
      "completions/mean_length": 2125.02099609375,
      "completions/mean_terminated_length": 990.25927734375,
      "completions/min_length": 344.0,
      "completions/min_terminated_length": 344.0,
      "epoch": 0.027428571428571427,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.1230333223938942,
      "kl": 0.0005963643391927084,
      "learning_rate": 2.2e-08,
      "loss": 0.0569,
      "num_tokens": 3428624.0,
      "reward": 0.5833333730697632,
      "reward_std": 0.16661180555820465,
      "rewards/format_reward/mean": 0.5833333134651184,
      "rewards/format_reward/std": 0.4955945909023285,
      "step": 12
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3432.0,
      "completions/mean_length": 3292.947998046875,
      "completions/mean_terminated_length": 2253.476318359375,
      "completions/min_length": 710.0,
      "completions/min_terminated_length": 710.0,
      "epoch": 0.029714285714285714,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.10567952692508698,
      "kl": 0.0005556742350260416,
      "learning_rate": 2.3999999999999997e-08,
      "loss": 0.0282,
      "num_tokens": 3761127.0,
      "reward": 0.2708333432674408,
      "reward_std": 0.2586348354816437,
      "rewards/format_reward/mean": 0.2708333432674408,
      "rewards/format_reward/std": 0.44672295451164246,
      "step": 13
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3397.0,
      "completions/mean_length": 3178.166748046875,
      "completions/mean_terminated_length": 1960.666748046875,
      "completions/min_length": 568.0,
      "completions/min_terminated_length": 568.0,
      "epoch": 0.032,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.09415514767169952,
      "kl": 0.000602563222249349,
      "learning_rate": 2.6e-08,
      "loss": 0.0457,
      "num_tokens": 4083169.0,
      "reward": 0.28125,
      "reward_std": 0.23311930894851685,
      "rewards/format_reward/mean": 0.28125,
      "rewards/format_reward/std": 0.45196935534477234,
      "step": 14
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3561.0,
      "completions/mean_length": 3266.09375,
      "completions/mean_terminated_length": 2130.71435546875,
      "completions/min_length": 447.0,
      "completions/min_terminated_length": 447.0,
      "epoch": 0.03428571428571429,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.07530064880847931,
      "kl": 0.0005038579305013021,
      "learning_rate": 2.8000000000000003e-08,
      "loss": 0.0049,
      "num_tokens": 4414018.0,
      "reward": 0.2916666865348816,
      "reward_std": 0.1801304817199707,
      "rewards/format_reward/mean": 0.2916666567325592,
      "rewards/format_reward/std": 0.4569156765937805,
      "step": 15
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.875,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3553.0,
      "completions/mean_length": 2674.17724609375,
      "completions/mean_terminated_length": 1643.04443359375,
      "completions/min_length": 602.0,
      "completions/min_terminated_length": 602.0,
      "epoch": 0.036571428571428574,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.12493874132633209,
      "kl": 0.0005288124084472656,
      "learning_rate": 3e-08,
      "loss": 0.0587,
      "num_tokens": 4687161.0,
      "reward": 0.5,
      "reward_std": 0.29090970754623413,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.5026246905326843,
      "step": 16
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.04166666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3582.0,
      "completions/mean_length": 3098.822998046875,
      "completions/mean_terminated_length": 1720.919921875,
      "completions/min_length": 524.0,
      "completions/min_terminated_length": 524.0,
      "epoch": 0.038857142857142854,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.09812473505735397,
      "kl": 0.0006046295166015625,
      "learning_rate": 3.2e-08,
      "loss": 0.0221,
      "num_tokens": 5001868.0,
      "reward": 0.2916666865348816,
      "reward_std": 0.2163209617137909,
      "rewards/format_reward/mean": 0.2916666567325592,
      "rewards/format_reward/std": 0.4569156765937805,
      "step": 17
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.75,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3520.0,
      "completions/mean_length": 2694.48974609375,
      "completions/mean_terminated_length": 1550.8333740234375,
      "completions/min_length": 288.0,
      "completions/min_terminated_length": 288.0,
      "epoch": 0.04114285714285714,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.11484291404485703,
      "kl": 0.0005998611450195312,
      "learning_rate": 3.4e-08,
      "loss": 0.077,
      "num_tokens": 5276751.0,
      "reward": 0.4895833432674408,
      "reward_std": 0.25055360794067383,
      "rewards/format_reward/mean": 0.4895833432674408,
      "rewards/format_reward/std": 0.5025155544281006,
      "step": 18
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.6666666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3443.0,
      "completions/mean_length": 2670.666748046875,
      "completions/mean_terminated_length": 1392.0,
      "completions/min_length": 304.0,
      "completions/min_terminated_length": 304.0,
      "epoch": 0.04342857142857143,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.09073452651500702,
      "kl": 0.0005601247151692709,
      "learning_rate": 3.6e-08,
      "loss": 0.0491,
      "num_tokens": 5549443.0,
      "reward": 0.4166666865348816,
      "reward_std": 0.16661179065704346,
      "rewards/format_reward/mean": 0.4166666567325592,
      "rewards/format_reward/std": 0.49559465050697327,
      "step": 19
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.25,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3475.0,
      "completions/mean_length": 2887.42724609375,
      "completions/mean_terminated_length": 1354.966796875,
      "completions/min_length": 411.0,
      "completions/min_terminated_length": 411.0,
      "epoch": 0.045714285714285714,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.09556788951158524,
      "kl": 0.0005938212076822916,
      "learning_rate": 3.7999999999999996e-08,
      "loss": 0.0749,
      "num_tokens": 5842338.0,
      "reward": 0.3125,
      "reward_std": 0.17532894015312195,
      "rewards/format_reward/mean": 0.3125,
      "rewards/format_reward/std": 0.4659455418586731,
      "step": 20
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.41666666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3473.0,
      "completions/mean_length": 2903.916748046875,
      "completions/mean_terminated_length": 1663.7647705078125,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 384.0,
      "epoch": 0.048,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.12257043272256851,
      "kl": 0.0005583763122558594,
      "learning_rate": 4e-08,
      "loss": 0.0543,
      "num_tokens": 6136450.0,
      "reward": 0.3854166865348816,
      "reward_std": 0.24575206637382507,
      "rewards/format_reward/mean": 0.3854166567325592,
      "rewards/format_reward/std": 0.4892484247684479,
      "step": 21
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.29166666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3493.0,
      "completions/mean_length": 3011.75,
      "completions/mean_terminated_length": 1811.8709716796875,
      "completions/min_length": 628.0,
      "completions/min_terminated_length": 628.0,
      "epoch": 0.05028571428571429,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.10206293314695358,
      "kl": 0.0005245208740234375,
      "learning_rate": 4.2e-08,
      "loss": 0.0345,
      "num_tokens": 6442168.0,
      "reward": 0.375,
      "reward_std": 0.25863486528396606,
      "rewards/format_reward/mean": 0.375,
      "rewards/format_reward/std": 0.4866642653942108,
      "step": 22
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.375,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3537.0,
      "completions/mean_length": 3026.73974609375,
      "completions/mean_terminated_length": 1962.87890625,
      "completions/min_length": 469.0,
      "completions/min_terminated_length": 469.0,
      "epoch": 0.052571428571428575,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.08821089565753937,
      "kl": 0.0005661646525065104,
      "learning_rate": 4.4e-08,
      "loss": 0.0006,
      "num_tokens": 6749481.0,
      "reward": 0.3645833432674408,
      "reward_std": 0.18208830058574677,
      "rewards/format_reward/mean": 0.3645833432674408,
      "rewards/format_reward/std": 0.4838397204875946,
      "step": 23
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.875,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3436.0,
      "completions/mean_length": 2724.0,
      "completions/mean_terminated_length": 1749.3333740234375,
      "completions/min_length": 550.0,
      "completions/min_terminated_length": 550.0,
      "epoch": 0.054857142857142854,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.1669473648071289,
      "kl": 0.0006227493286132812,
      "learning_rate": 4.6e-08,
      "loss": 0.0681,
      "num_tokens": 7026327.0,
      "reward": 0.53125,
      "reward_std": 0.3574172258377075,
      "rewards/format_reward/mean": 0.53125,
      "rewards/format_reward/std": 0.5016420483589172,
      "step": 24
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.04166666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3512.0,
      "completions/mean_length": 3240.1875,
      "completions/mean_terminated_length": 2263.760009765625,
      "completions/min_length": 565.0,
      "completions/min_terminated_length": 565.0,
      "epoch": 0.05714285714285714,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.10814571380615234,
      "kl": 0.0005668004353841146,
      "learning_rate": 4.799999999999999e-08,
      "loss": 0.0206,
      "num_tokens": 7352895.0,
      "reward": 0.3125,
      "reward_std": 0.24468021094799042,
      "rewards/format_reward/mean": 0.3125,
      "rewards/format_reward/std": 0.4659455418586731,
      "step": 25
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.6666666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3561.0,
      "completions/mean_length": 2478.822998046875,
      "completions/mean_terminated_length": 931.5750122070312,
      "completions/min_length": 451.0,
      "completions/min_terminated_length": 451.0,
      "epoch": 0.05942857142857143,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.08812437951564789,
      "kl": 0.0005440711975097656,
      "learning_rate": 5e-08,
      "loss": 0.039,
      "num_tokens": 7606390.0,
      "reward": 0.4375,
      "reward_std": 0.1430540829896927,
      "rewards/format_reward/mean": 0.4375,
      "rewards/format_reward/std": 0.4986824691295624,
      "step": 26
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3575.0,
      "completions/mean_length": 3112.64599609375,
      "completions/mean_terminated_length": 1698.5833740234375,
      "completions/min_length": 470.0,
      "completions/min_terminated_length": 470.0,
      "epoch": 0.061714285714285715,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.10746309161186218,
      "kl": 0.0005650520324707031,
      "learning_rate": 5.2e-08,
      "loss": 0.0464,
      "num_tokens": 7921542.0,
      "reward": 0.3125000298023224,
      "reward_std": 0.280870646238327,
      "rewards/format_reward/mean": 0.3125,
      "rewards/format_reward/std": 0.4659455418586731,
      "step": 27
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3546.0,
      "completions/mean_length": 3297.375,
      "completions/mean_terminated_length": 2055.333251953125,
      "completions/min_length": 396.0,
      "completions/min_terminated_length": 396.0,
      "epoch": 0.064,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.10604382306337357,
      "kl": 0.0006319681803385416,
      "learning_rate": 5.4e-08,
      "loss": 0.056,
      "num_tokens": 8254734.0,
      "reward": 0.2604166865348816,
      "reward_std": 0.23311933875083923,
      "rewards/format_reward/mean": 0.2604166567325592,
      "rewards/format_reward/std": 0.4411657154560089,
      "step": 28
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.16666666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3467.0,
      "completions/mean_length": 3198.21875,
      "completions/mean_terminated_length": 2261.321533203125,
      "completions/min_length": 352.0,
      "completions/min_terminated_length": 352.0,
      "epoch": 0.06628571428571428,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.08207737654447556,
      "kl": 0.000556786855061849,
      "learning_rate": 5.6000000000000005e-08,
      "loss": 0.0236,
      "num_tokens": 8578473.0,
      "reward": 0.34375,
      "reward_std": 0.21436314284801483,
      "rewards/format_reward/mean": 0.34375,
      "rewards/format_reward/std": 0.4774521291255951,
      "step": 29
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.45833333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3287.0,
      "completions/mean_length": 2793.53125,
      "completions/mean_terminated_length": 1415.857177734375,
      "completions/min_length": 426.0,
      "completions/min_terminated_length": 426.0,
      "epoch": 0.06857142857142857,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.09903497248888016,
      "kl": 0.00063323974609375,
      "learning_rate": 5.7999999999999997e-08,
      "loss": 0.0259,
      "num_tokens": 8862576.0,
      "reward": 0.40625,
      "reward_std": 0.18208828568458557,
      "rewards/format_reward/mean": 0.40625,
      "rewards/format_reward/std": 0.4937104284763336,
      "step": 30
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.25,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3232.0,
      "completions/mean_length": 2894.52099609375,
      "completions/mean_terminated_length": 1377.666748046875,
      "completions/min_length": 367.0,
      "completions/min_terminated_length": 367.0,
      "epoch": 0.07085714285714285,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.11826644837856293,
      "kl": 0.0004975001017252604,
      "learning_rate": 6e-08,
      "loss": 0.0843,
      "num_tokens": 9156134.0,
      "reward": 0.375,
      "reward_std": 0.29090970754623413,
      "rewards/format_reward/mean": 0.375,
      "rewards/format_reward/std": 0.4866642653942108,
      "step": 31
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.125,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3519.0,
      "completions/mean_length": 2436.1875,
      "completions/mean_terminated_length": 1423.411865234375,
      "completions/min_length": 408.0,
      "completions/min_terminated_length": 408.0,
      "epoch": 0.07314285714285715,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.12378033995628357,
      "kl": 0.0005936622619628906,
      "learning_rate": 6.2e-08,
      "loss": 0.035,
      "num_tokens": 9406298.0,
      "reward": 0.5833333730697632,
      "reward_std": 0.2163209319114685,
      "rewards/format_reward/mean": 0.5833333134651184,
      "rewards/format_reward/std": 0.4955945909023285,
      "step": 32
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.625,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3472.0,
      "completions/mean_length": 2883.979248046875,
      "completions/mean_terminated_length": 1860.871826171875,
      "completions/min_length": 504.0,
      "completions/min_terminated_length": 504.0,
      "epoch": 0.07542857142857143,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.09643115103244781,
      "kl": 0.0005831718444824219,
      "learning_rate": 6.4e-08,
      "loss": 0.0298,
      "num_tokens": 9699252.0,
      "reward": 0.4479166865348816,
      "reward_std": 0.20084445178508759,
      "rewards/format_reward/mean": 0.4479166567325592,
      "rewards/format_reward/std": 0.49989035725593567,
      "step": 33
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.20833333333333337,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3394.0,
      "completions/mean_length": 3245.291748046875,
      "completions/mean_terminated_length": 1872.631591796875,
      "completions/min_length": 671.0,
      "completions/min_terminated_length": 671.0,
      "epoch": 0.07771428571428571,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1022980734705925,
      "kl": 0.0005474090576171875,
      "learning_rate": 6.6e-08,
      "loss": 0.0465,
      "num_tokens": 10027630.0,
      "reward": 0.2604166865348816,
      "reward_std": 0.2466379851102829,
      "rewards/format_reward/mean": 0.2604166567325592,
      "rewards/format_reward/std": 0.4411657154560089,
      "step": 34
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5416666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3362.0,
      "completions/mean_length": 2790.885498046875,
      "completions/mean_terminated_length": 1526.189208984375,
      "completions/min_length": 328.0,
      "completions/min_terminated_length": 328.0,
      "epoch": 0.08,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.09115320444107056,
      "kl": 0.0005726814270019531,
      "learning_rate": 6.8e-08,
      "loss": 0.0514,
      "num_tokens": 10311023.0,
      "reward": 0.3958333432674408,
      "reward_std": 0.16661179065704346,
      "rewards/format_reward/mean": 0.3958333432674408,
      "rewards/format_reward/std": 0.4915960133075714,
      "step": 35
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.04166666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3487.0,
      "completions/mean_length": 3176.09375,
      "completions/mean_terminated_length": 2017.6400146484375,
      "completions/min_length": 570.0,
      "completions/min_terminated_length": 570.0,
      "epoch": 0.08228571428571428,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.09013529121875763,
      "kl": 0.0005197525024414062,
      "learning_rate": 6.999999999999999e-08,
      "loss": 0.0358,
      "num_tokens": 10632122.0,
      "reward": 0.2812500298023224,
      "reward_std": 0.23311933875083923,
      "rewards/format_reward/mean": 0.28125,
      "rewards/format_reward/std": 0.45196935534477234,
      "step": 36
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.29166666666666663,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3568.0,
      "completions/mean_length": 3280.61474609375,
      "completions/mean_terminated_length": 1870.7647705078125,
      "completions/min_length": 389.0,
      "completions/min_terminated_length": 389.0,
      "epoch": 0.08457142857142858,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.09990935772657394,
      "kl": 0.0005869865417480469,
      "learning_rate": 7.2e-08,
      "loss": 0.0076,
      "num_tokens": 10963327.0,
      "reward": 0.2916666865348816,
      "reward_std": 0.243794247508049,
      "rewards/format_reward/mean": 0.2916666567325592,
      "rewards/format_reward/std": 0.4569156765937805,
      "step": 37
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.08333333333333337,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3414.0,
      "completions/mean_length": 3143.30224609375,
      "completions/mean_terminated_length": 1660.95458984375,
      "completions/min_length": 419.0,
      "completions/min_terminated_length": 419.0,
      "epoch": 0.08685714285714285,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.1052294448018074,
      "kl": 0.0005707740783691406,
      "learning_rate": 7.4e-08,
      "loss": 0.0365,
      "num_tokens": 11280684.0,
      "reward": 0.2604166865348816,
      "reward_std": 0.25187548995018005,
      "rewards/format_reward/mean": 0.2604166567325592,
      "rewards/format_reward/std": 0.4411657154560089,
      "step": 38
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.33333333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 2451.0,
      "completions/mean_length": 2676.33349609375,
      "completions/mean_terminated_length": 861.0,
      "completions/min_length": 366.0,
      "completions/min_terminated_length": 366.0,
      "epoch": 0.08914285714285715,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.11297257989645004,
      "kl": 0.0005629857381184896,
      "learning_rate": 7.599999999999999e-08,
      "loss": 0.0573,
      "num_tokens": 11553614.0,
      "reward": 0.34375,
      "reward_std": 0.10882141441106796,
      "rewards/format_reward/mean": 0.34375,
      "rewards/format_reward/std": 0.4774521291255951,
      "step": 39
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7083333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3579.0,
      "completions/mean_length": 2672.42724609375,
      "completions/mean_terminated_length": 1449.5853271484375,
      "completions/min_length": 290.0,
      "completions/min_terminated_length": 290.0,
      "epoch": 0.09142857142857143,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.1159047782421112,
      "kl": 0.0005908012390136719,
      "learning_rate": 7.8e-08,
      "loss": 0.0235,
      "num_tokens": 11826007.0,
      "reward": 0.4375,
      "reward_std": 0.2076038122177124,
      "rewards/format_reward/mean": 0.4375,
      "rewards/format_reward/std": 0.4986824691295624,
      "step": 40
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3220.0,
      "completions/mean_length": 3086.229248046875,
      "completions/mean_terminated_length": 1592.916748046875,
      "completions/min_length": 299.0,
      "completions/min_terminated_length": 299.0,
      "epoch": 0.09371428571428571,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.08972278982400894,
      "kl": 0.0006130536397298177,
      "learning_rate": 8e-08,
      "loss": 0.0427,
      "num_tokens": 12138461.0,
      "reward": 0.2916666865348816,
      "reward_std": 0.1801304817199707,
      "rewards/format_reward/mean": 0.2916666567325592,
      "rewards/format_reward/std": 0.4569157063961029,
      "step": 41
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.75,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3523.0,
      "completions/mean_length": 2744.979248046875,
      "completions/mean_terminated_length": 1666.2381591796875,
      "completions/min_length": 523.0,
      "completions/min_terminated_length": 523.0,
      "epoch": 0.096,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.12443576008081436,
      "kl": 0.0005466143290201823,
      "learning_rate": 8.199999999999999e-08,
      "loss": 0.0248,
      "num_tokens": 12418449.0,
      "reward": 0.4583333432674408,
      "reward_std": 0.1430540680885315,
      "rewards/format_reward/mean": 0.4583333432674408,
      "rewards/format_reward/std": 0.5008764266967773,
      "step": 42
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.41666666666666663,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3558.0,
      "completions/mean_length": 3242.52099609375,
      "completions/mean_terminated_length": 1242.4285888671875,
      "completions/min_length": 268.0,
      "completions/min_terminated_length": 268.0,
      "epoch": 0.09828571428571428,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.09909730404615402,
      "kl": 0.0005853970845540365,
      "learning_rate": 8.4e-08,
      "loss": 0.0365,
      "num_tokens": 12746813.0,
      "reward": 0.1666666716337204,
      "reward_std": 0.1988866627216339,
      "rewards/format_reward/mean": 0.1666666716337204,
      "rewards/format_reward/std": 0.374634325504303,
      "step": 43
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3469.0,
      "completions/mean_length": 2681.5,
      "completions/mean_terminated_length": 1779.0,
      "completions/min_length": 503.0,
      "completions/min_terminated_length": 503.0,
      "epoch": 0.10057142857142858,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.12858478724956512,
      "kl": 0.0006157557169596354,
      "learning_rate": 8.599999999999999e-08,
      "loss": 0.0533,
      "num_tokens": 13020377.0,
      "reward": 0.5416666865348816,
      "reward_std": 0.25863486528396606,
      "rewards/format_reward/mean": 0.5416666865348816,
      "rewards/format_reward/std": 0.5008764266967773,
      "step": 44
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.20833333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3498.0,
      "completions/mean_length": 3165.58349609375,
      "completions/mean_terminated_length": 2198.896484375,
      "completions/min_length": 610.0,
      "completions/min_terminated_length": 610.0,
      "epoch": 0.10285714285714286,
      "frac_reward_zero_std": 0.3125,
      "grad_norm": 0.12761904299259186,
      "kl": 0.000564416249593099,
      "learning_rate": 8.8e-08,
      "loss": 0.0308,
      "num_tokens": 13340059.0,
      "reward": 0.3541666865348816,
      "reward_std": 0.31838303804397583,
      "rewards/format_reward/mean": 0.3541666567325592,
      "rewards/format_reward/std": 0.4807705879211426,
      "step": 45
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.29166666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3209.0,
      "completions/mean_length": 2985.55224609375,
      "completions/mean_terminated_length": 1730.741943359375,
      "completions/min_length": 416.0,
      "completions/min_terminated_length": 416.0,
      "epoch": 0.10514285714285715,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.1372997909784317,
      "kl": 0.0005257924397786459,
      "learning_rate": 9e-08,
      "loss": 0.0533,
      "num_tokens": 13643196.0,
      "reward": 0.3645833432674408,
      "reward_std": 0.292867511510849,
      "rewards/format_reward/mean": 0.3645833432674408,
      "rewards/format_reward/std": 0.4838397204875946,
      "step": 46
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5416666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3569.0,
      "completions/mean_length": 2885.96875,
      "completions/mean_terminated_length": 1772.8919677734375,
      "completions/min_length": 496.0,
      "completions/min_terminated_length": 496.0,
      "epoch": 0.10742857142857143,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.09933201223611832,
      "kl": 0.0005814234415690104,
      "learning_rate": 9.2e-08,
      "loss": 0.0661,
      "num_tokens": 13936977.0,
      "reward": 0.4270833432674408,
      "reward_std": 0.18208828568458557,
      "rewards/format_reward/mean": 0.4270833432674408,
      "rewards/format_reward/std": 0.4972511827945709,
      "step": 47
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.2083333333333335,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 2784.0,
      "completions/mean_length": 2161.541748046875,
      "completions/mean_terminated_length": 1007.4717407226562,
      "completions/min_length": 199.0,
      "completions/min_terminated_length": 199.0,
      "epoch": 0.10971428571428571,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.12496917694807053,
      "kl": 0.0005776087443033854,
      "learning_rate": 9.4e-08,
      "loss": 0.039,
      "num_tokens": 14159347.0,
      "reward": 0.5729166865348816,
      "reward_std": 0.14109627902507782,
      "rewards/format_reward/mean": 0.5729166865348816,
      "rewards/format_reward/std": 0.4972512125968933,
      "step": 48
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7916666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3555.0,
      "completions/mean_length": 2724.27099609375,
      "completions/mean_terminated_length": 1664.6046142578125,
      "completions/min_length": 545.0,
      "completions/min_terminated_length": 545.0,
      "epoch": 0.112,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.1356138288974762,
      "kl": 0.0005715688069661459,
      "learning_rate": 9.599999999999999e-08,
      "loss": 0.0539,
      "num_tokens": 14436279.0,
      "reward": 0.4479166865348816,
      "reward_std": 0.2828284502029419,
      "rewards/format_reward/mean": 0.4479166567325592,
      "rewards/format_reward/std": 0.49989035725593567,
      "step": 49
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.20833333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3563.0,
      "completions/mean_length": 3031.697998046875,
      "completions/mean_terminated_length": 1755.689697265625,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 512.0,
      "epoch": 0.11428571428571428,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.10621047019958496,
      "kl": 0.0006009737650553385,
      "learning_rate": 9.799999999999999e-08,
      "loss": 0.0136,
      "num_tokens": 14743666.0,
      "reward": 0.3541666865348816,
      "reward_std": 0.23987865447998047,
      "rewards/format_reward/mean": 0.3541666567325592,
      "rewards/format_reward/std": 0.48077061772346497,
      "step": 50
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.41666666666666663,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3039.0,
      "completions/mean_length": 3248.041748046875,
      "completions/mean_terminated_length": 1280.2857666015625,
      "completions/min_length": 494.0,
      "completions/min_terminated_length": 494.0,
      "epoch": 0.11657142857142858,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.11713991314172745,
      "kl": 0.0005950927734375,
      "learning_rate": 1e-07,
      "loss": 0.0354,
      "num_tokens": 15071078.0,
      "reward": 0.1770833432674408,
      "reward_std": 0.20084446668624878,
      "rewards/format_reward/mean": 0.1770833283662796,
      "rewards/format_reward/std": 0.3837430775165558,
      "step": 51
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.2916666666666665,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3584.0,
      "completions/mean_length": 2244.541748046875,
      "completions/mean_terminated_length": 1246.036376953125,
      "completions/min_length": 246.0,
      "completions/min_terminated_length": 246.0,
      "epoch": 0.11885714285714286,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.10812873393297195,
      "kl": 0.0005008379618326823,
      "learning_rate": 9.999890338174275e-08,
      "loss": -0.0157,
      "num_tokens": 15303192.0,
      "reward": 0.5520833730697632,
      "reward_std": 0.14109627902507782,
      "rewards/format_reward/mean": 0.5520833134651184,
      "rewards/format_reward/std": 0.49989035725593567,
      "step": 52
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5833333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3534.0,
      "completions/mean_length": 2764.20849609375,
      "completions/mean_terminated_length": 1512.9473876953125,
      "completions/min_length": 507.0,
      "completions/min_terminated_length": 507.0,
      "epoch": 0.12114285714285715,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.08318143337965012,
      "kl": 0.0005362828572591146,
      "learning_rate": 9.999561358041868e-08,
      "loss": 0.0381,
      "num_tokens": 15584606.0,
      "reward": 0.4479166865348816,
      "reward_std": 0.15372902154922485,
      "rewards/format_reward/mean": 0.4479166567325592,
      "rewards/format_reward/std": 0.49989035725593567,
      "step": 53
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.375,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3549.0,
      "completions/mean_length": 2930.70849609375,
      "completions/mean_terminated_length": 1683.5152587890625,
      "completions/min_length": 310.0,
      "completions/min_terminated_length": 310.0,
      "epoch": 0.12342857142857143,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.09070580452680588,
      "kl": 0.0005499521891276041,
      "learning_rate": 9.999013075636804e-08,
      "loss": 0.0242,
      "num_tokens": 15882172.0,
      "reward": 0.3854166865348816,
      "reward_std": 0.17728674411773682,
      "rewards/format_reward/mean": 0.3854166567325592,
      "rewards/format_reward/std": 0.4892484247684479,
      "step": 54
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.16666666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3567.0,
      "completions/mean_length": 3003.40625,
      "completions/mean_terminated_length": 1593.3929443359375,
      "completions/min_length": 479.0,
      "completions/min_terminated_length": 479.0,
      "epoch": 0.12571428571428572,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.10172918438911438,
      "kl": 0.0005849202473958334,
      "learning_rate": 9.998245517681594e-08,
      "loss": 0.0171,
      "num_tokens": 16186891.0,
      "reward": 0.3020833432674408,
      "reward_std": 0.16856960952281952,
      "rewards/format_reward/mean": 0.3020833432674408,
      "rewards/format_reward/std": 0.46157145500183105,
      "step": 55
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0416666666666665,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3470.0,
      "completions/mean_length": 2583.979248046875,
      "completions/mean_terminated_length": 1624.7755126953125,
      "completions/min_length": 258.0,
      "completions/min_terminated_length": 258.0,
      "epoch": 0.128,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.07545579224824905,
      "kl": 0.0005804697672526041,
      "learning_rate": 9.997258721585931e-08,
      "loss": 0.0244,
      "num_tokens": 16451051.0,
      "reward": 0.5208333730697632,
      "reward_std": 0.11077921092510223,
      "rewards/format_reward/mean": 0.5208333134651184,
      "rewards/format_reward/std": 0.5021882057189941,
      "step": 56
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.08333333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3485.0,
      "completions/mean_length": 3133.73974609375,
      "completions/mean_terminated_length": 1921.5001220703125,
      "completions/min_length": 634.0,
      "completions/min_terminated_length": 634.0,
      "epoch": 0.13028571428571428,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.0956675112247467,
      "kl": 0.0006281534830729166,
      "learning_rate": 9.996052735444862e-08,
      "loss": 0.0399,
      "num_tokens": 16767484.0,
      "reward": 0.2916666865348816,
      "reward_std": 0.23116151988506317,
      "rewards/format_reward/mean": 0.2916666567325592,
      "rewards/format_reward/std": 0.4569156765937805,
      "step": 57
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.25,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3497.0,
      "completions/mean_length": 3043.229248046875,
      "completions/mean_terminated_length": 1853.533447265625,
      "completions/min_length": 467.0,
      "completions/min_terminated_length": 467.0,
      "epoch": 0.13257142857142856,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.09832925349473953,
      "kl": 0.0006004969278971354,
      "learning_rate": 9.994627618036453e-08,
      "loss": 0.045,
      "num_tokens": 17076140.0,
      "reward": 0.3333333432674408,
      "reward_std": 0.2350771129131317,
      "rewards/format_reward/mean": 0.3333333432674408,
      "rewards/format_reward/std": 0.4738790988922119,
      "step": 58
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.04166666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3482.0,
      "completions/mean_length": 3199.604248046875,
      "completions/mean_terminated_length": 2107.919921875,
      "completions/min_length": 402.0,
      "completions/min_terminated_length": 402.0,
      "epoch": 0.13485714285714287,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.09223364293575287,
      "kl": 0.0006233851114908854,
      "learning_rate": 9.992983438818914e-08,
      "loss": 0.0386,
      "num_tokens": 17399472.0,
      "reward": 0.3125,
      "reward_std": 0.1975647658109665,
      "rewards/format_reward/mean": 0.3125,
      "rewards/format_reward/std": 0.4659455418586731,
      "step": 59
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.08333333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3352.0,
      "completions/mean_length": 3025.23974609375,
      "completions/mean_terminated_length": 1520.8846435546875,
      "completions/min_length": 582.0,
      "completions/min_terminated_length": 582.0,
      "epoch": 0.13714285714285715,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.1303519904613495,
      "kl": 0.0006186167399088541,
      "learning_rate": 9.991120277927222e-08,
      "loss": 0.0723,
      "num_tokens": 17706173.0,
      "reward": 0.3020833432674408,
      "reward_std": 0.3015846610069275,
      "rewards/format_reward/mean": 0.3020833432674408,
      "rewards/format_reward/std": 0.46157145500183105,
      "step": 60
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.125,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3575.0,
      "completions/mean_length": 3220.166748046875,
      "completions/mean_terminated_length": 2290.370361328125,
      "completions/min_length": 741.0,
      "completions/min_terminated_length": 741.0,
      "epoch": 0.13942857142857143,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1127300038933754,
      "kl": 0.0005509058634440104,
      "learning_rate": 9.989038226169209e-08,
      "loss": 0.0621,
      "num_tokens": 18031425.0,
      "reward": 0.375,
      "reward_std": 0.24859580397605896,
      "rewards/format_reward/mean": 0.375,
      "rewards/format_reward/std": 0.4866642653942108,
      "step": 61
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.04166666666666663,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3536.0,
      "completions/mean_length": 3095.354248046875,
      "completions/mean_terminated_length": 1544.434814453125,
      "completions/min_length": 421.0,
      "completions/min_terminated_length": 421.0,
      "epoch": 0.1417142857142857,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.09086631238460541,
      "kl": 0.0006039937337239584,
      "learning_rate": 9.98673738502114e-08,
      "loss": 0.0153,
      "num_tokens": 18344623.0,
      "reward": 0.25,
      "reward_std": 0.1430540829896927,
      "rewards/format_reward/mean": 0.25,
      "rewards/format_reward/std": 0.435285747051239,
      "step": 62
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0416666666666665,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3220.0,
      "completions/mean_length": 2315.14599609375,
      "completions/mean_terminated_length": 1098.0816650390625,
      "completions/min_length": 254.0,
      "completions/min_terminated_length": 254.0,
      "epoch": 0.144,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.10809347033500671,
      "kl": 0.0006092389424641927,
      "learning_rate": 9.984217866622769e-08,
      "loss": 0.0626,
      "num_tokens": 18581997.0,
      "reward": 0.53125,
      "reward_std": 0.14109627902507782,
      "rewards/format_reward/mean": 0.53125,
      "rewards/format_reward/std": 0.5016420483589172,
      "step": 63
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.8333333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3477.0,
      "completions/mean_length": 2666.30224609375,
      "completions/mean_terminated_length": 1581.75,
      "completions/min_length": 332.0,
      "completions/min_terminated_length": 332.0,
      "epoch": 0.1462857142857143,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.11078927665948868,
      "kl": 0.0005582173665364584,
      "learning_rate": 9.981479793771865e-08,
      "loss": 0.0651,
      "num_tokens": 18853448.0,
      "reward": 0.4791666865348816,
      "reward_std": 0.2350771129131317,
      "rewards/format_reward/mean": 0.4791666567325592,
      "rewards/format_reward/std": 0.5021882057189941,
      "step": 64
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.20833333333333337,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3476.0,
      "completions/mean_length": 3188.760498046875,
      "completions/mean_terminated_length": 1587.0,
      "completions/min_length": 457.0,
      "completions/min_terminated_length": 457.0,
      "epoch": 0.14857142857142858,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.07864987850189209,
      "kl": 0.000557104746500651,
      "learning_rate": 9.978523299918239e-08,
      "loss": 0.0321,
      "num_tokens": 19176051.0,
      "reward": 0.2083333432674408,
      "reward_std": 0.17532894015312195,
      "rewards/format_reward/mean": 0.2083333283662796,
      "rewards/format_reward/std": 0.40824830532073975,
      "step": 65
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.33333333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3388.0,
      "completions/mean_length": 2840.61474609375,
      "completions/mean_terminated_length": 1353.84375,
      "completions/min_length": 456.0,
      "completions/min_terminated_length": 456.0,
      "epoch": 0.15085714285714286,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.1085980162024498,
      "kl": 0.0005858739217122396,
      "learning_rate": 9.975348529157229e-08,
      "loss": 0.0418,
      "num_tokens": 19464632.0,
      "reward": 0.3958333432674408,
      "reward_std": 0.2163209319114685,
      "rewards/format_reward/mean": 0.3958333432674408,
      "rewards/format_reward/std": 0.4915960133075714,
      "step": 66
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.9583333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3323.0,
      "completions/mean_length": 2540.041748046875,
      "completions/mean_terminated_length": 1451.6595458984375,
      "completions/min_length": 218.0,
      "completions/min_terminated_length": 218.0,
      "epoch": 0.15314285714285714,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.11475242674350739,
      "kl": 0.0005539258321126302,
      "learning_rate": 9.971955636222684e-08,
      "loss": 0.0616,
      "num_tokens": 19723902.0,
      "reward": 0.5104166865348816,
      "reward_std": 0.19080540537834167,
      "rewards/format_reward/mean": 0.5104166865348816,
      "rewards/format_reward/std": 0.5025155544281006,
      "step": 67
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.45833333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3569.0,
      "completions/mean_length": 2818.17724609375,
      "completions/mean_terminated_length": 1483.4571533203125,
      "completions/min_length": 300.0,
      "completions/min_terminated_length": 300.0,
      "epoch": 0.15542857142857142,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.09802790731191635,
      "kl": 0.0005327860514322916,
      "learning_rate": 9.968344786479416e-08,
      "loss": 0.0134,
      "num_tokens": 20010815.0,
      "reward": 0.40625,
      "reward_std": 0.14981341361999512,
      "rewards/format_reward/mean": 0.40625,
      "rewards/format_reward/std": 0.4937104284763336,
      "step": 68
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5833333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3559.0,
      "completions/mean_length": 2808.08349609375,
      "completions/mean_terminated_length": 1623.7894287109375,
      "completions/min_length": 360.0,
      "completions/min_terminated_length": 360.0,
      "epoch": 0.15771428571428572,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.10697630792856216,
      "kl": 0.0005957285563151041,
      "learning_rate": 9.964516155915151e-08,
      "loss": 0.0126,
      "num_tokens": 20297635.0,
      "reward": 0.4166666865348816,
      "reward_std": 0.1618102639913559,
      "rewards/format_reward/mean": 0.4166666567325592,
      "rewards/format_reward/std": 0.4955946207046509,
      "step": 69
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3463.0,
      "completions/mean_length": 3381.229248046875,
      "completions/mean_terminated_length": 2286.266845703125,
      "completions/min_length": 531.0,
      "completions/min_terminated_length": 531.0,
      "epoch": 0.16,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.10971253365278244,
      "kl": 0.0006380081176757812,
      "learning_rate": 9.960469931131937e-08,
      "loss": 0.0533,
      "num_tokens": 20637767.0,
      "reward": 0.2291666865348816,
      "reward_std": 0.2808706760406494,
      "rewards/format_reward/mean": 0.2291666716337204,
      "rewards/format_reward/std": 0.42250296473503113,
      "step": 70
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.16666666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3478.0,
      "completions/mean_length": 3010.729248046875,
      "completions/mean_terminated_length": 1618.5001220703125,
      "completions/min_length": 823.0,
      "completions/min_terminated_length": 823.0,
      "epoch": 0.16228571428571428,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.09700077027082443,
      "kl": 0.0005467732747395834,
      "learning_rate": 9.956206309337066e-08,
      "loss": 0.0285,
      "num_tokens": 20943861.0,
      "reward": 0.34375,
      "reward_std": 0.2283177673816681,
      "rewards/format_reward/mean": 0.34375,
      "rewards/format_reward/std": 0.4774521291255951,
      "step": 71
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.04166666666666663,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3501.0,
      "completions/mean_length": 3115.479248046875,
      "completions/mean_terminated_length": 1628.434814453125,
      "completions/min_length": 315.0,
      "completions/min_terminated_length": 315.0,
      "epoch": 0.16457142857142856,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.14516979455947876,
      "kl": 0.0005690256754557291,
      "learning_rate": 9.951725498333447e-08,
      "loss": 0.0499,
      "num_tokens": 21259273.0,
      "reward": 0.3125,
      "reward_std": 0.27606913447380066,
      "rewards/format_reward/mean": 0.3125,
      "rewards/format_reward/std": 0.4659455418586731,
      "step": 72
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.8333333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3569.0,
      "completions/mean_length": 2583.729248046875,
      "completions/mean_terminated_length": 1401.5909423828125,
      "completions/min_length": 492.0,
      "completions/min_terminated_length": 492.0,
      "epoch": 0.16685714285714287,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 0.06490742415189743,
      "kl": 0.000545501708984375,
      "learning_rate": 9.947027716509488e-08,
      "loss": 0.0093,
      "num_tokens": 21523775.0,
      "reward": 0.4791666865348816,
      "reward_std": 0.09202303737401962,
      "rewards/format_reward/mean": 0.4791666567325592,
      "rewards/format_reward/std": 0.5021882057189941,
      "step": 73
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.4583333333333335,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3471.0,
      "completions/mean_length": 2347.875,
      "completions/mean_terminated_length": 1572.677978515625,
      "completions/min_length": 349.0,
      "completions/min_terminated_length": 349.0,
      "epoch": 0.16914285714285715,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.0859948992729187,
      "kl": 0.00044027964274088543,
      "learning_rate": 9.942113192828445e-08,
      "loss": 0.0131,
      "num_tokens": 21764615.0,
      "reward": 0.6458333730697632,
      "reward_std": 0.15177123248577118,
      "rewards/format_reward/mean": 0.6458333134651184,
      "rewards/format_reward/std": 0.4807705879211426,
      "step": 74
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.8333333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3569.0,
      "completions/mean_length": 2864.197998046875,
      "completions/mean_terminated_length": 2013.5228271484375,
      "completions/min_length": 367.0,
      "completions/min_terminated_length": 367.0,
      "epoch": 0.17142857142857143,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.11670469492673874,
      "kl": 0.0005636215209960938,
      "learning_rate": 9.936982166817271e-08,
      "loss": 0.0381,
      "num_tokens": 22055946.0,
      "reward": 0.5208333730697632,
      "reward_std": 0.2350771278142929,
      "rewards/format_reward/mean": 0.5208333134651184,
      "rewards/format_reward/std": 0.5021882057189941,
      "step": 75
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7916666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3542.0,
      "completions/mean_length": 2683.17724609375,
      "completions/mean_terminated_length": 1572.8604736328125,
      "completions/min_length": 485.0,
      "completions/min_terminated_length": 485.0,
      "epoch": 0.1737142857142857,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.11684764176607132,
      "kl": 0.0005763371785481771,
      "learning_rate": 9.931634888554937e-08,
      "loss": 0.0463,
      "num_tokens": 22329821.0,
      "reward": 0.46875,
      "reward_std": 0.2221943587064743,
      "rewards/format_reward/mean": 0.46875,
      "rewards/format_reward/std": 0.5016420483589172,
      "step": 76
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.45833333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3534.0,
      "completions/mean_length": 2772.15625,
      "completions/mean_terminated_length": 1357.2286376953125,
      "completions/min_length": 319.0,
      "completions/min_terminated_length": 319.0,
      "epoch": 0.176,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.10801287740468979,
      "kl": 0.0005984306335449219,
      "learning_rate": 9.926071618660237e-08,
      "loss": 0.0288,
      "num_tokens": 22611692.0,
      "reward": 0.3958333432674408,
      "reward_std": 0.1888476312160492,
      "rewards/format_reward/mean": 0.3958333432674408,
      "rewards/format_reward/std": 0.4915960133075714,
      "step": 77
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.29166666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3390.0,
      "completions/mean_length": 3108.86474609375,
      "completions/mean_terminated_length": 2112.61279296875,
      "completions/min_length": 533.0,
      "completions/min_terminated_length": 533.0,
      "epoch": 0.1782857142857143,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.12827573716640472,
      "kl": 0.0006354649861653646,
      "learning_rate": 9.920292628279099e-08,
      "loss": 0.0595,
      "num_tokens": 22927237.0,
      "reward": 0.34375,
      "reward_std": 0.292867511510849,
      "rewards/format_reward/mean": 0.34375,
      "rewards/format_reward/std": 0.4774521291255951,
      "step": 78
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.45833333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3283.0,
      "completions/mean_length": 2821.23974609375,
      "completions/mean_terminated_length": 1491.857177734375,
      "completions/min_length": 501.0,
      "completions/min_terminated_length": 501.0,
      "epoch": 0.18057142857142858,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1304936707019806,
      "kl": 0.0006202061971028646,
      "learning_rate": 9.914298199071361e-08,
      "loss": 0.0767,
      "num_tokens": 23213934.0,
      "reward": 0.3854166865348816,
      "reward_std": 0.23311930894851685,
      "rewards/format_reward/mean": 0.3854166567325592,
      "rewards/format_reward/std": 0.4892484247684479,
      "step": 79
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5416666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3472.0,
      "completions/mean_length": 2843.45849609375,
      "completions/mean_terminated_length": 1662.5946044921875,
      "completions/min_length": 452.0,
      "completions/min_terminated_length": 452.0,
      "epoch": 0.18285714285714286,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.12500052154064178,
      "kl": 0.0005324681599934896,
      "learning_rate": 9.908088623197048e-08,
      "loss": 0.0331,
      "num_tokens": 23503124.0,
      "reward": 0.4166666865348816,
      "reward_std": 0.2076038122177124,
      "rewards/format_reward/mean": 0.4166666567325592,
      "rewards/format_reward/std": 0.49559465050697327,
      "step": 80
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.75,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3370.0,
      "completions/mean_length": 2606.80224609375,
      "completions/mean_terminated_length": 1350.40478515625,
      "completions/min_length": 351.0,
      "completions/min_terminated_length": 351.0,
      "epoch": 0.18514285714285714,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.09482422471046448,
      "kl": 0.0005370775858561198,
      "learning_rate": 9.901664203302125e-08,
      "loss": 0.0333,
      "num_tokens": 23768887.0,
      "reward": 0.4479166865348816,
      "reward_std": 0.13629475235939026,
      "rewards/format_reward/mean": 0.4479166567325592,
      "rewards/format_reward/std": 0.49989035725593567,
      "step": 81
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.8333333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3567.0,
      "completions/mean_length": 2587.89599609375,
      "completions/mean_terminated_length": 1410.681884765625,
      "completions/min_length": 487.0,
      "completions/min_terminated_length": 487.0,
      "epoch": 0.18742857142857142,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.09811008721590042,
      "kl": 0.0005925496419270834,
      "learning_rate": 9.895025252503755e-08,
      "loss": 0.0505,
      "num_tokens": 24033021.0,
      "reward": 0.4791666865348816,
      "reward_std": 0.15177121758460999,
      "rewards/format_reward/mean": 0.4791666567325592,
      "rewards/format_reward/std": 0.5021882057189941,
      "step": 82
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3414.0,
      "completions/mean_length": 3143.73974609375,
      "completions/mean_terminated_length": 1571.3809814453125,
      "completions/min_length": 493.0,
      "completions/min_terminated_length": 493.0,
      "epoch": 0.18971428571428572,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 0.07115814089775085,
      "kl": 0.0005896886189778646,
      "learning_rate": 9.888172094375033e-08,
      "loss": 0.038,
      "num_tokens": 24351236.0,
      "reward": 0.21875,
      "reward_std": 0.09006524085998535,
      "rewards/format_reward/mean": 0.21875,
      "rewards/format_reward/std": 0.4155687391757965,
      "step": 83
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5833333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3340.0,
      "completions/mean_length": 2987.479248046875,
      "completions/mean_terminated_length": 2077.0,
      "completions/min_length": 469.0,
      "completions/min_terminated_length": 469.0,
      "epoch": 0.192,
      "frac_reward_zero_std": 0.3125,
      "grad_norm": 0.12423859536647797,
      "kl": 0.0005726814270019531,
      "learning_rate": 9.88110506292922e-08,
      "loss": 0.0804,
      "num_tokens": 24654432.0,
      "reward": 0.40625,
      "reward_std": 0.32514238357543945,
      "rewards/format_reward/mean": 0.40625,
      "rewards/format_reward/std": 0.4937104284763336,
      "step": 84
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.8333333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3471.0,
      "completions/mean_length": 2784.9375,
      "completions/mean_terminated_length": 1840.5909423828125,
      "completions/min_length": 388.0,
      "completions/min_terminated_length": 388.0,
      "epoch": 0.19428571428571428,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.1060919538140297,
      "kl": 0.0005359649658203125,
      "learning_rate": 9.873824502603459e-08,
      "loss": 0.0599,
      "num_tokens": 24938262.0,
      "reward": 0.53125,
      "reward_std": 0.26059263944625854,
      "rewards/format_reward/mean": 0.53125,
      "rewards/format_reward/std": 0.5016420483589172,
      "step": 85
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.29166666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3472.0,
      "completions/mean_length": 2932.885498046875,
      "completions/mean_terminated_length": 1567.6451416015625,
      "completions/min_length": 460.0,
      "completions/min_terminated_length": 460.0,
      "epoch": 0.19657142857142856,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.14066024124622345,
      "kl": 0.0005710919698079427,
      "learning_rate": 9.866330768241983e-08,
      "loss": 0.0289,
      "num_tokens": 25235827.0,
      "reward": 0.375,
      "reward_std": 0.25863486528396606,
      "rewards/format_reward/mean": 0.375,
      "rewards/format_reward/std": 0.4866642653942108,
      "step": 86
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.08333333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3368.0,
      "completions/mean_length": 3057.604248046875,
      "completions/mean_terminated_length": 1640.3846435546875,
      "completions/min_length": 336.0,
      "completions/min_terminated_length": 336.0,
      "epoch": 0.19885714285714284,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.11923669278621674,
      "kl": 0.0005593299865722656,
      "learning_rate": 9.85862422507884e-08,
      "loss": 0.0618,
      "num_tokens": 25545647.0,
      "reward": 0.2604166865348816,
      "reward_std": 0.18688982725143433,
      "rewards/format_reward/mean": 0.2604166567325592,
      "rewards/format_reward/std": 0.4411657154560089,
      "step": 87
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.125,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3567.0,
      "completions/mean_length": 2599.48974609375,
      "completions/mean_terminated_length": 1730.803955078125,
      "completions/min_length": 429.0,
      "completions/min_terminated_length": 429.0,
      "epoch": 0.20114285714285715,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.10778623074293137,
      "kl": 0.0005170504252115885,
      "learning_rate": 9.850705248720067e-08,
      "loss": 0.0493,
      "num_tokens": 25811650.0,
      "reward": 0.5729166865348816,
      "reward_std": 0.20084445178508759,
      "rewards/format_reward/mean": 0.5729166865348816,
      "rewards/format_reward/std": 0.4972512125968933,
      "step": 88
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0416666666666665,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3487.0,
      "completions/mean_length": 2505.041748046875,
      "completions/mean_terminated_length": 1470.1224365234375,
      "completions/min_length": 289.0,
      "completions/min_terminated_length": 289.0,
      "epoch": 0.20342857142857143,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.13670764863491058,
      "kl": 0.0005366007486979166,
      "learning_rate": 9.842574225125401e-08,
      "loss": 0.0698,
      "num_tokens": 26068160.0,
      "reward": 0.5520833730697632,
      "reward_std": 0.24183647334575653,
      "rewards/format_reward/mean": 0.5520833134651184,
      "rewards/format_reward/std": 0.4998903274536133,
      "step": 89
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.6666666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3577.0,
      "completions/mean_length": 2809.84375,
      "completions/mean_terminated_length": 1726.0250244140625,
      "completions/min_length": 623.0,
      "completions/min_terminated_length": 623.0,
      "epoch": 0.2057142857142857,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.0981605127453804,
      "kl": 0.0005105336507161459,
      "learning_rate": 9.834231550589461e-08,
      "loss": 0.0202,
      "num_tokens": 26354153.0,
      "reward": 0.4479166865348816,
      "reward_std": 0.18208828568458557,
      "rewards/format_reward/mean": 0.4479166567325592,
      "rewards/format_reward/std": 0.49989035725593567,
      "step": 90
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.29166666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3514.0,
      "completions/mean_length": 2839.354248046875,
      "completions/mean_terminated_length": 1278.0,
      "completions/min_length": 406.0,
      "completions/min_terminated_length": 406.0,
      "epoch": 0.208,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.09133773297071457,
      "kl": 0.0005725224812825521,
      "learning_rate": 9.825677631722435e-08,
      "loss": 0.0475,
      "num_tokens": 26642895.0,
      "reward": 0.3333333432674408,
      "reward_std": 0.16661179065704346,
      "rewards/format_reward/mean": 0.3333333432674408,
      "rewards/format_reward/std": 0.4738790988922119,
      "step": 91
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5833333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3070.0,
      "completions/mean_length": 2715.947998046875,
      "completions/mean_terminated_length": 1391.0263671875,
      "completions/min_length": 265.0,
      "completions/min_terminated_length": 265.0,
      "epoch": 0.2102857142857143,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.11354603618383408,
      "kl": 0.0005235671997070312,
      "learning_rate": 9.816912885430258e-08,
      "loss": 0.049,
      "num_tokens": 26920108.0,
      "reward": 0.4166666865348816,
      "reward_std": 0.23116151988506317,
      "rewards/format_reward/mean": 0.4166666567325592,
      "rewards/format_reward/std": 0.4955946207046509,
      "step": 92
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.45833333333333337,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 2996.0,
      "completions/mean_length": 3379.90625,
      "completions/mean_terminated_length": 2076.84619140625,
      "completions/min_length": 676.0,
      "completions/min_terminated_length": 676.0,
      "epoch": 0.21257142857142858,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.09983745962381363,
      "kl": 0.0006497701009114584,
      "learning_rate": 9.807937738894302e-08,
      "loss": 0.0502,
      "num_tokens": 27260623.0,
      "reward": 0.1666666716337204,
      "reward_std": 0.2350771278142929,
      "rewards/format_reward/mean": 0.1666666716337204,
      "rewards/format_reward/std": 0.374634325504303,
      "step": 93
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.04166666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3545.0,
      "completions/mean_length": 3095.135498046875,
      "completions/mean_terminated_length": 1706.760009765625,
      "completions/min_length": 523.0,
      "completions/min_terminated_length": 523.0,
      "epoch": 0.21485714285714286,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.11047376692295074,
      "kl": 0.0006453196207682291,
      "learning_rate": 9.798752629550545e-08,
      "loss": 0.0327,
      "num_tokens": 27574148.0,
      "reward": 0.3125,
      "reward_std": 0.23987865447998047,
      "rewards/format_reward/mean": 0.3125,
      "rewards/format_reward/std": 0.4659455418586731,
      "step": 94
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.25,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3545.0,
      "completions/mean_length": 2959.28125,
      "completions/mean_terminated_length": 1584.9000244140625,
      "completions/min_length": 409.0,
      "completions/min_terminated_length": 409.0,
      "epoch": 0.21714285714285714,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.08645107597112656,
      "kl": 0.0006707509358723959,
      "learning_rate": 9.789358005068261e-08,
      "loss": 0.0238,
      "num_tokens": 27874355.0,
      "reward": 0.3645833432674408,
      "reward_std": 0.13629475235939026,
      "rewards/format_reward/mean": 0.3645833432674408,
      "rewards/format_reward/std": 0.4838397204875946,
      "step": 95
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.3333333333333335,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3454.0,
      "completions/mean_length": 2239.510498046875,
      "completions/mean_terminated_length": 1279.1607666015625,
      "completions/min_length": 311.0,
      "completions/min_terminated_length": 311.0,
      "epoch": 0.21942857142857142,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.11768729984760284,
      "kl": 0.0005092620849609375,
      "learning_rate": 9.779754323328191e-08,
      "loss": 0.0247,
      "num_tokens": 28104798.0,
      "reward": 0.6041667461395264,
      "reward_std": 0.2076037973165512,
      "rewards/format_reward/mean": 0.6041666865348816,
      "rewards/format_reward/std": 0.4915960133075714,
      "step": 96
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5833333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3267.0,
      "completions/mean_length": 2639.822998046875,
      "completions/mean_terminated_length": 1198.7105712890625,
      "completions/min_length": 217.0,
      "completions/min_terminated_length": 217.0,
      "epoch": 0.22171428571428572,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.09902984648942947,
      "kl": 0.0006221135457356771,
      "learning_rate": 9.769942052400234e-08,
      "loss": 0.0299,
      "num_tokens": 28374433.0,
      "reward": 0.4270833432674408,
      "reward_std": 0.15461495518684387,
      "rewards/format_reward/mean": 0.4270833432674408,
      "rewards/format_reward/std": 0.4972511827945709,
      "step": 97
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5416666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3412.0,
      "completions/mean_length": 2780.39599609375,
      "completions/mean_terminated_length": 1498.9730224609375,
      "completions/min_length": 429.0,
      "completions/min_terminated_length": 429.0,
      "epoch": 0.224,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.13889165222644806,
      "kl": 0.0006262461344401041,
      "learning_rate": 9.759921670520633e-08,
      "loss": 0.0847,
      "num_tokens": 28656483.0,
      "reward": 0.40625,
      "reward_std": 0.26191452145576477,
      "rewards/format_reward/mean": 0.40625,
      "rewards/format_reward/std": 0.4937104284763336,
      "step": 98
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.04166666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3414.0,
      "completions/mean_length": 3180.83349609375,
      "completions/mean_terminated_length": 2035.8399658203125,
      "completions/min_length": 420.0,
      "completions/min_terminated_length": 420.0,
      "epoch": 0.22628571428571428,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.11569508910179138,
      "kl": 0.0006573994954427084,
      "learning_rate": 9.749693666068663e-08,
      "loss": 0.0812,
      "num_tokens": 28977947.0,
      "reward": 0.3020833432674408,
      "reward_std": 0.3015846312046051,
      "rewards/format_reward/mean": 0.3020833432674408,
      "rewards/format_reward/std": 0.46157145500183105,
      "step": 99
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.08333333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3539.0,
      "completions/mean_length": 3071.260498046875,
      "completions/mean_terminated_length": 1690.8077392578125,
      "completions/min_length": 257.0,
      "completions/min_terminated_length": 257.0,
      "epoch": 0.22857142857142856,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.11107376217842102,
      "kl": 0.0005807876586914062,
      "learning_rate": 9.739258537542834e-08,
      "loss": 0.0475,
      "num_tokens": 29288514.0,
      "reward": 0.3020833432674408,
      "reward_std": 0.27063167095184326,
      "rewards/format_reward/mean": 0.3020833432674408,
      "rewards/format_reward/std": 0.46157145500183105,
      "step": 100
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.9583333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3528.0,
      "completions/mean_length": 2619.375,
      "completions/mean_terminated_length": 1613.7020263671875,
      "completions/min_length": 407.0,
      "completions/min_terminated_length": 407.0,
      "epoch": 0.23085714285714284,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.11716129630804062,
      "kl": 0.0005361239115397135,
      "learning_rate": 9.728616793536587e-08,
      "loss": 0.059,
      "num_tokens": 29556156.0,
      "reward": 0.5520833730697632,
      "reward_std": 0.24183645844459534,
      "rewards/format_reward/mean": 0.5520833134651184,
      "rewards/format_reward/std": 0.49989035725593567,
      "step": 101
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.6666666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3443.0,
      "completions/mean_length": 2519.625,
      "completions/mean_terminated_length": 1029.5,
      "completions/min_length": 246.0,
      "completions/min_terminated_length": 246.0,
      "epoch": 0.23314285714285715,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.12329067289829254,
      "kl": 0.0005308787027994791,
      "learning_rate": 9.717768952713512e-08,
      "loss": 0.0208,
      "num_tokens": 29812920.0,
      "reward": 0.4270833432674408,
      "reward_std": 0.23311933875083923,
      "rewards/format_reward/mean": 0.4270833432674408,
      "rewards/format_reward/std": 0.4972511827945709,
      "step": 102
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3445.0,
      "completions/mean_length": 2608.0,
      "completions/mean_terminated_length": 1632.0,
      "completions/min_length": 338.0,
      "completions/min_terminated_length": 338.0,
      "epoch": 0.23542857142857143,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.10539143532514572,
      "kl": 0.0005432764689127604,
      "learning_rate": 9.706715543782063e-08,
      "loss": 0.0549,
      "num_tokens": 30078750.0,
      "reward": 0.5208333730697632,
      "reward_std": 0.23987865447998047,
      "rewards/format_reward/mean": 0.5208333134651184,
      "rewards/format_reward/std": 0.5021882057189941,
      "step": 103
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.2083333333333335,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3516.0,
      "completions/mean_length": 2404.65625,
      "completions/mean_terminated_length": 1447.8302001953125,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "epoch": 0.2377142857142857,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.111219622194767,
      "kl": 0.0005749066670735677,
      "learning_rate": 9.695457105469804e-08,
      "loss": 0.0424,
      "num_tokens": 30325539.0,
      "reward": 0.5625,
      "reward_std": 0.1343369334936142,
      "rewards/format_reward/mean": 0.5625,
      "rewards/format_reward/std": 0.4986824691295624,
      "step": 104
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.29166666666666663,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3510.0,
      "completions/mean_length": 3327.0625,
      "completions/mean_terminated_length": 2133.058837890625,
      "completions/min_length": 615.0,
      "completions/min_terminated_length": 615.0,
      "epoch": 0.24,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.11259549856185913,
      "kl": 0.000629425048828125,
      "learning_rate": 9.683994186497132e-08,
      "loss": 0.0331,
      "num_tokens": 30662145.0,
      "reward": 0.1979166716337204,
      "reward_std": 0.1921273171901703,
      "rewards/format_reward/mean": 0.1979166716337204,
      "rewards/format_reward/std": 0.4005205035209656,
      "step": 105
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.45833333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3529.0,
      "completions/mean_length": 2797.90625,
      "completions/mean_terminated_length": 1427.857177734375,
      "completions/min_length": 540.0,
      "completions/min_terminated_length": 540.0,
      "epoch": 0.2422857142857143,
      "frac_reward_zero_std": 0.3125,
      "grad_norm": 0.13242016732692719,
      "kl": 0.0005540847778320312,
      "learning_rate": 9.672327345550542e-08,
      "loss": 0.0805,
      "num_tokens": 30947184.0,
      "reward": 0.4166666865348816,
      "reward_std": 0.31838300824165344,
      "rewards/format_reward/mean": 0.4166666567325592,
      "rewards/format_reward/std": 0.49559465050697327,
      "step": 106
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3464.0,
      "completions/mean_length": 2729.875,
      "completions/mean_terminated_length": 1306.3333740234375,
      "completions/min_length": 193.0,
      "completions/min_terminated_length": 193.0,
      "epoch": 0.24457142857142858,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.08758668601512909,
      "kl": 0.0005545616149902344,
      "learning_rate": 9.660457151255409e-08,
      "loss": 0.0443,
      "num_tokens": 31225488.0,
      "reward": 0.375,
      "reward_std": 0.1430540680885315,
      "rewards/format_reward/mean": 0.375,
      "rewards/format_reward/std": 0.4866642653942108,
      "step": 107
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.9166666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3501.0,
      "completions/mean_length": 2560.166748046875,
      "completions/mean_terminated_length": 1447.3043212890625,
      "completions/min_length": 486.0,
      "completions/min_terminated_length": 486.0,
      "epoch": 0.24685714285714286,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1443079113960266,
      "kl": 0.000594933827718099,
      "learning_rate": 9.648384182148251e-08,
      "loss": 0.0789,
      "num_tokens": 31486792.0,
      "reward": 0.5208333730697632,
      "reward_std": 0.24859580397605896,
      "rewards/format_reward/mean": 0.5208333134651184,
      "rewards/format_reward/std": 0.5021882057189941,
      "step": 108
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.16666666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3529.0,
      "completions/mean_length": 3014.83349609375,
      "completions/mean_terminated_length": 1632.571533203125,
      "completions/min_length": 619.0,
      "completions/min_terminated_length": 619.0,
      "epoch": 0.24914285714285714,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.08680311590433121,
      "kl": 0.0006758371988932291,
      "learning_rate": 9.636109026648555e-08,
      "loss": 0.0102,
      "num_tokens": 31792914.0,
      "reward": 0.3229166865348816,
      "reward_std": 0.19080543518066406,
      "rewards/format_reward/mean": 0.3229166567325592,
      "rewards/format_reward/std": 0.4700457453727722,
      "step": 109
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.9166666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 2862.0,
      "completions/mean_length": 2426.104248046875,
      "completions/mean_terminated_length": 1167.521728515625,
      "completions/min_length": 435.0,
      "completions/min_terminated_length": 435.0,
      "epoch": 0.25142857142857145,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.11593201011419296,
      "kl": 0.0005362828572591146,
      "learning_rate": 9.623632283030077e-08,
      "loss": 0.0289,
      "num_tokens": 32041420.0,
      "reward": 0.5,
      "reward_std": 0.16661179065704346,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.5026246905326843,
      "step": 110
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.125,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3523.0,
      "completions/mean_length": 2867.08349609375,
      "completions/mean_terminated_length": 1034.9630126953125,
      "completions/min_length": 433.0,
      "completions/min_terminated_length": 433.0,
      "epoch": 0.2537142857142857,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.09556243568658829,
      "kl": 0.0005575815836588541,
      "learning_rate": 9.610954559391703e-08,
      "loss": 0.0358,
      "num_tokens": 32332386.0,
      "reward": 0.3125,
      "reward_std": 0.2076038122177124,
      "rewards/format_reward/mean": 0.3125,
      "rewards/format_reward/std": 0.4659455418586731,
      "step": 111
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.4166666666666665,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3519.0,
      "completions/mean_length": 2368.791748046875,
      "completions/mean_terminated_length": 1572.6207275390625,
      "completions/min_length": 460.0,
      "completions/min_terminated_length": 460.0,
      "epoch": 0.256,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.10844600200653076,
      "kl": 0.0005437533060709635,
      "learning_rate": 9.598076473627796e-08,
      "loss": 0.0255,
      "num_tokens": 32575294.0,
      "reward": 0.65625,
      "reward_std": 0.2379208505153656,
      "rewards/format_reward/mean": 0.65625,
      "rewards/format_reward/std": 0.4774521291255951,
      "step": 112
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.9583333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3578.0,
      "completions/mean_length": 2600.166748046875,
      "completions/mean_terminated_length": 1574.468017578125,
      "completions/min_length": 319.0,
      "completions/min_terminated_length": 319.0,
      "epoch": 0.2582857142857143,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.13194780051708221,
      "kl": 0.0005145072937011719,
      "learning_rate": 9.584998653398089e-08,
      "loss": 0.0817,
      "num_tokens": 32840822.0,
      "reward": 0.5416666865348816,
      "reward_std": 0.26735198497772217,
      "rewards/format_reward/mean": 0.5416666865348816,
      "rewards/format_reward/std": 0.5008764266967773,
      "step": 113
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.16666666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3549.0,
      "completions/mean_length": 3054.05224609375,
      "completions/mean_terminated_length": 1767.0357666015625,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "epoch": 0.26057142857142856,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.10078351199626923,
      "kl": 0.0005081494649251302,
      "learning_rate": 9.571721736097087e-08,
      "loss": 0.0219,
      "num_tokens": 33149899.0,
      "reward": 0.3229166865348816,
      "reward_std": 0.20564600825309753,
      "rewards/format_reward/mean": 0.3229166567325592,
      "rewards/format_reward/std": 0.4700457453727722,
      "step": 114
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.41666666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3479.0,
      "completions/mean_length": 2923.77099609375,
      "completions/mean_terminated_length": 1719.823486328125,
      "completions/min_length": 505.0,
      "completions/min_terminated_length": 505.0,
      "epoch": 0.26285714285714284,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.15436427295207977,
      "kl": 0.0006771087646484375,
      "learning_rate": 9.558246368823013e-08,
      "loss": 0.0573,
      "num_tokens": 33447255.0,
      "reward": 0.3958333432674408,
      "reward_std": 0.2586348056793213,
      "rewards/format_reward/mean": 0.3958333432674408,
      "rewards/format_reward/std": 0.4915960133075714,
      "step": 115
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.6666666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3531.0,
      "completions/mean_length": 2840.75,
      "completions/mean_terminated_length": 1800.2000732421875,
      "completions/min_length": 613.0,
      "completions/min_terminated_length": 613.0,
      "epoch": 0.2651428571428571,
      "frac_reward_zero_std": 0.3125,
      "grad_norm": 0.1236443817615509,
      "kl": 0.0005249977111816406,
      "learning_rate": 9.544573208346251e-08,
      "loss": 0.0751,
      "num_tokens": 33735843.0,
      "reward": 0.4375,
      "reward_std": 0.30966588854789734,
      "rewards/format_reward/mean": 0.4375,
      "rewards/format_reward/std": 0.4986824691295624,
      "step": 116
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.1666666666666665,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3573.0,
      "completions/mean_length": 2401.322998046875,
      "completions/mean_terminated_length": 1400.59619140625,
      "completions/min_length": 233.0,
      "completions/min_terminated_length": 233.0,
      "epoch": 0.2674285714285714,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1455051302909851,
      "kl": 0.0005470911661783854,
      "learning_rate": 9.530702921077357e-08,
      "loss": 0.0647,
      "num_tokens": 33982030.0,
      "reward": 0.5729166865348816,
      "reward_std": 0.22831778228282928,
      "rewards/format_reward/mean": 0.5729166865348816,
      "rewards/format_reward/std": 0.4972512125968933,
      "step": 117
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.08333333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3544.0,
      "completions/mean_length": 3229.52099609375,
      "completions/mean_terminated_length": 2275.154052734375,
      "completions/min_length": 435.0,
      "completions/min_terminated_length": 435.0,
      "epoch": 0.26971428571428574,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.11932012438774109,
      "kl": 0.0005947748819986979,
      "learning_rate": 9.516636183034564e-08,
      "loss": 0.0577,
      "num_tokens": 34308324.0,
      "reward": 0.2916666865348816,
      "reward_std": 0.28219255805015564,
      "rewards/format_reward/mean": 0.2916666567325592,
      "rewards/format_reward/std": 0.45691564679145813,
      "step": 118
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7916666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3545.0,
      "completions/mean_length": 2713.9375,
      "completions/mean_terminated_length": 1641.534912109375,
      "completions/min_length": 423.0,
      "completions/min_terminated_length": 423.0,
      "epoch": 0.272,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.10066317766904831,
      "kl": 0.0005396207173665365,
      "learning_rate": 9.502373679810839e-08,
      "loss": 0.0423,
      "num_tokens": 34585440.0,
      "reward": 0.46875,
      "reward_std": 0.1921273171901703,
      "rewards/format_reward/mean": 0.46875,
      "rewards/format_reward/std": 0.5016420483589172,
      "step": 119
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.9583333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3298.0,
      "completions/mean_length": 2630.80224609375,
      "completions/mean_terminated_length": 1637.04248046875,
      "completions/min_length": 337.0,
      "completions/min_terminated_length": 337.0,
      "epoch": 0.2742857142857143,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.13681873679161072,
      "kl": 0.0005944569905598959,
      "learning_rate": 9.487916106540466e-08,
      "loss": 0.0358,
      "num_tokens": 34853531.0,
      "reward": 0.5416666865348816,
      "reward_std": 0.19408510625362396,
      "rewards/format_reward/mean": 0.5416666865348816,
      "rewards/format_reward/std": 0.5008764266967773,
      "step": 120
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.375,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3577.0,
      "completions/mean_length": 3081.822998046875,
      "completions/mean_terminated_length": 2123.121337890625,
      "completions/min_length": 267.0,
      "completions/min_terminated_length": 267.0,
      "epoch": 0.2765714285714286,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.10625406354665756,
      "kl": 0.0005485216776529948,
      "learning_rate": 9.473264167865172e-08,
      "loss": 0.0282,
      "num_tokens": 35165340.0,
      "reward": 0.40625,
      "reward_std": 0.2780269384384155,
      "rewards/format_reward/mean": 0.40625,
      "rewards/format_reward/std": 0.4937104284763336,
      "step": 121
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5833333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3012.0,
      "completions/mean_length": 2615.510498046875,
      "completions/mean_terminated_length": 1137.2894287109375,
      "completions/min_length": 396.0,
      "completions/min_terminated_length": 396.0,
      "epoch": 0.27885714285714286,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.04336674138903618,
      "kl": 0.0005785624186197916,
      "learning_rate": 9.458418577899774e-08,
      "loss": 0.0144,
      "num_tokens": 35432413.0,
      "reward": 0.40625,
      "reward_std": 0.05779037997126579,
      "rewards/format_reward/mean": 0.40625,
      "rewards/format_reward/std": 0.4937104284763336,
      "step": 122
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.625,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3531.0,
      "completions/mean_length": 2774.20849609375,
      "completions/mean_terminated_length": 1590.666748046875,
      "completions/min_length": 575.0,
      "completions/min_terminated_length": 575.0,
      "epoch": 0.28114285714285714,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.07755483686923981,
      "kl": 0.000576178232828776,
      "learning_rate": 9.443380060197386e-08,
      "loss": 0.0074,
      "num_tokens": 35714451.0,
      "reward": 0.4270833432674408,
      "reward_std": 0.12234010547399521,
      "rewards/format_reward/mean": 0.4270833432674408,
      "rewards/format_reward/std": 0.4972511827945709,
      "step": 123
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.04166666666666663,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3146.0,
      "completions/mean_length": 3162.625,
      "completions/mean_terminated_length": 1825.2174072265625,
      "completions/min_length": 722.0,
      "completions/min_terminated_length": 722.0,
      "epoch": 0.2834285714285714,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.11886311322450638,
      "kl": 0.0006281534830729166,
      "learning_rate": 9.428149347714143e-08,
      "loss": 0.0249,
      "num_tokens": 36034413.0,
      "reward": 0.25,
      "reward_std": 0.1988866776227951,
      "rewards/format_reward/mean": 0.25,
      "rewards/format_reward/std": 0.435285747051239,
      "step": 124
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7083333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3392.0,
      "completions/mean_length": 2739.041748046875,
      "completions/mean_terminated_length": 1605.5609130859375,
      "completions/min_length": 579.0,
      "completions/min_terminated_length": 579.0,
      "epoch": 0.2857142857142857,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.09605032950639725,
      "kl": 0.00054168701171875,
      "learning_rate": 9.412727182773486e-08,
      "loss": 0.0456,
      "num_tokens": 36313561.0,
      "reward": 0.4479166865348816,
      "reward_std": 0.12234010547399521,
      "rewards/format_reward/mean": 0.4479166567325592,
      "rewards/format_reward/std": 0.49989035725593567,
      "step": 125
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.375,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3417.0,
      "completions/mean_length": 2860.0,
      "completions/mean_terminated_length": 1477.8182373046875,
      "completions/min_length": 400.0,
      "completions/min_terminated_length": 400.0,
      "epoch": 0.288,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.10251232981681824,
      "kl": 0.0005782445271809896,
      "learning_rate": 9.397114317029974e-08,
      "loss": 0.0653,
      "num_tokens": 36605005.0,
      "reward": 0.3541666865348816,
      "reward_std": 0.1801304817199707,
      "rewards/format_reward/mean": 0.3541666567325592,
      "rewards/format_reward/std": 0.48077061772346497,
      "step": 126
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7083333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3371.0,
      "completions/mean_length": 2641.39599609375,
      "completions/mean_terminated_length": 1376.9267578125,
      "completions/min_length": 393.0,
      "completions/min_terminated_length": 393.0,
      "epoch": 0.29028571428571426,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.11962489783763885,
      "kl": 0.0005617141723632812,
      "learning_rate": 9.381311511432658e-08,
      "loss": 0.0987,
      "num_tokens": 36874623.0,
      "reward": 0.4583333432674408,
      "reward_std": 0.2525114417076111,
      "rewards/format_reward/mean": 0.4583333432674408,
      "rewards/format_reward/std": 0.5008764266967773,
      "step": 127
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7916666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3017.0,
      "completions/mean_length": 2454.010498046875,
      "completions/mean_terminated_length": 1061.2325439453125,
      "completions/min_length": 379.0,
      "completions/min_terminated_length": 379.0,
      "epoch": 0.2925714285714286,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.10726732760667801,
      "kl": 0.0005575815836588541,
      "learning_rate": 9.36531953618799e-08,
      "loss": 0.0395,
      "num_tokens": 37126294.0,
      "reward": 0.4583333432674408,
      "reward_std": 0.13301503658294678,
      "rewards/format_reward/mean": 0.4583333432674408,
      "rewards/format_reward/std": 0.5008764266967773,
      "step": 128
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.33333333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3352.0,
      "completions/mean_length": 2805.885498046875,
      "completions/mean_terminated_length": 1249.65625,
      "completions/min_length": 446.0,
      "completions/min_terminated_length": 446.0,
      "epoch": 0.2948571428571429,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.12535634636878967,
      "kl": 0.0006647109985351562,
      "learning_rate": 9.34913917072228e-08,
      "loss": 0.0999,
      "num_tokens": 37410617.0,
      "reward": 0.3541666865348816,
      "reward_std": 0.24468021094799042,
      "rewards/format_reward/mean": 0.3541666567325592,
      "rewards/format_reward/std": 0.48077061772346497,
      "step": 129
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.45833333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3441.0,
      "completions/mean_length": 2837.875,
      "completions/mean_terminated_length": 1537.4857177734375,
      "completions/min_length": 357.0,
      "completions/min_terminated_length": 357.0,
      "epoch": 0.29714285714285715,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.08393247425556183,
      "kl": 0.0005332628885904948,
      "learning_rate": 9.332771203643714e-08,
      "loss": 0.0075,
      "num_tokens": 37698221.0,
      "reward": 0.40625,
      "reward_std": 0.17728674411773682,
      "rewards/format_reward/mean": 0.40625,
      "rewards/format_reward/std": 0.4937104284763336,
      "step": 130
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.20833333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3502.0,
      "completions/mean_length": 2980.27099609375,
      "completions/mean_terminated_length": 1585.4482421875,
      "completions/min_length": 379.0,
      "completions/min_terminated_length": 379.0,
      "epoch": 0.29942857142857143,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.09301748126745224,
      "kl": 0.0005849202473958334,
      "learning_rate": 9.316216432703917e-08,
      "loss": 0.0428,
      "num_tokens": 38001259.0,
      "reward": 0.3333333432674408,
      "reward_std": 0.22635996341705322,
      "rewards/format_reward/mean": 0.3333333432674408,
      "rewards/format_reward/std": 0.4738790988922119,
      "step": 131
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.9166666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3474.0,
      "completions/mean_length": 2670.9375,
      "completions/mean_terminated_length": 1678.478271484375,
      "completions/min_length": 285.0,
      "completions/min_terminated_length": 285.0,
      "epoch": 0.3017142857142857,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.13140074908733368,
      "kl": 0.000514984130859375,
      "learning_rate": 9.299475664759068e-08,
      "loss": 0.0757,
      "num_tokens": 38272585.0,
      "reward": 0.5208333730697632,
      "reward_std": 0.24468019604682922,
      "rewards/format_reward/mean": 0.5208333134651184,
      "rewards/format_reward/std": 0.5021882057189941,
      "step": 132
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3183.0,
      "completions/mean_length": 3229.95849609375,
      "completions/mean_terminated_length": 1695.77783203125,
      "completions/min_length": 867.0,
      "completions/min_terminated_length": 867.0,
      "epoch": 0.304,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 0.07494261860847473,
      "kl": 0.00063323974609375,
      "learning_rate": 9.282549715730579e-08,
      "loss": 0.0298,
      "num_tokens": 38599491.0,
      "reward": 0.1979166716337204,
      "reward_std": 0.09878238290548325,
      "rewards/format_reward/mean": 0.1979166716337204,
      "rewards/format_reward/std": 0.4005205035209656,
      "step": 133
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.33333333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3458.0,
      "completions/mean_length": 2853.416748046875,
      "completions/mean_terminated_length": 1392.25,
      "completions/min_length": 367.0,
      "completions/min_terminated_length": 367.0,
      "epoch": 0.3062857142857143,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.07386979460716248,
      "kl": 0.0005450248718261719,
      "learning_rate": 9.265439410565328e-08,
      "loss": 0.0372,
      "num_tokens": 38889769.0,
      "reward": 0.375,
      "reward_std": 0.15177121758460999,
      "rewards/format_reward/mean": 0.375,
      "rewards/format_reward/std": 0.4866642653942108,
      "step": 134
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7916666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3189.0,
      "completions/mean_length": 2473.64599609375,
      "completions/mean_terminated_length": 1105.06982421875,
      "completions/min_length": 316.0,
      "completions/min_terminated_length": 316.0,
      "epoch": 0.30857142857142855,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.11156300455331802,
      "kl": 0.0006051063537597656,
      "learning_rate": 9.248145583195447e-08,
      "loss": 0.0609,
      "num_tokens": 39141885.0,
      "reward": 0.4895833432674408,
      "reward_std": 0.20956158638000488,
      "rewards/format_reward/mean": 0.4895833432674408,
      "rewards/format_reward/std": 0.5025155544281006,
      "step": 135
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3333.0,
      "completions/mean_length": 2794.260498046875,
      "completions/mean_terminated_length": 1478.02783203125,
      "completions/min_length": 425.0,
      "completions/min_terminated_length": 425.0,
      "epoch": 0.31085714285714283,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.12031218409538269,
      "kl": 0.0005391438802083334,
      "learning_rate": 9.230669076497686e-08,
      "loss": 0.0584,
      "num_tokens": 39426376.0,
      "reward": 0.3854166865348816,
      "reward_std": 0.20084445178508759,
      "rewards/format_reward/mean": 0.3854166567325592,
      "rewards/format_reward/std": 0.4892484247684479,
      "step": 136
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5416666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3392.0,
      "completions/mean_length": 2761.979248046875,
      "completions/mean_terminated_length": 1451.189208984375,
      "completions/min_length": 618.0,
      "completions/min_terminated_length": 618.0,
      "epoch": 0.31314285714285717,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.0906476154923439,
      "kl": 0.0005747477213541666,
      "learning_rate": 9.213010742252327e-08,
      "loss": 0.064,
      "num_tokens": 39708008.0,
      "reward": 0.40625,
      "reward_std": 0.20084445178508759,
      "rewards/format_reward/mean": 0.40625,
      "rewards/format_reward/std": 0.4937104284763336,
      "step": 137
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.9583333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3162.0,
      "completions/mean_length": 2545.83349609375,
      "completions/mean_terminated_length": 1463.4892578125,
      "completions/min_length": 475.0,
      "completions/min_terminated_length": 475.0,
      "epoch": 0.31542857142857145,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.08902504295110703,
      "kl": 0.0005307197570800781,
      "learning_rate": 9.195171441101667e-08,
      "loss": 0.0554,
      "num_tokens": 39968590.0,
      "reward": 0.5208333730697632,
      "reward_std": 0.18404607474803925,
      "rewards/format_reward/mean": 0.5208333134651184,
      "rewards/format_reward/std": 0.5021882057189941,
      "step": 138
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.33333333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3552.0,
      "completions/mean_length": 2885.4375,
      "completions/mean_terminated_length": 1488.3125,
      "completions/min_length": 280.0,
      "completions/min_terminated_length": 280.0,
      "epoch": 0.3177142857142857,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.11393805593252182,
      "kl": 0.0006208419799804688,
      "learning_rate": 9.177152042508077e-08,
      "loss": 0.0793,
      "num_tokens": 40262128.0,
      "reward": 0.3854166865348816,
      "reward_std": 0.21436314284801483,
      "rewards/format_reward/mean": 0.3854166567325592,
      "rewards/format_reward/std": 0.4892484247684479,
      "step": 139
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.875,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3460.0,
      "completions/mean_length": 2637.36474609375,
      "completions/mean_terminated_length": 1564.5111083984375,
      "completions/min_length": 454.0,
      "completions/min_terminated_length": 454.0,
      "epoch": 0.32,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.11296574771404266,
      "kl": 0.0005234082539876302,
      "learning_rate": 9.158953424711624e-08,
      "loss": 0.0365,
      "num_tokens": 40531449.0,
      "reward": 0.53125,
      "reward_std": 0.23311933875083923,
      "rewards/format_reward/mean": 0.53125,
      "rewards/format_reward/std": 0.5016420483589172,
      "step": 140
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5833333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3461.0,
      "completions/mean_length": 2686.83349609375,
      "completions/mean_terminated_length": 1317.4737548828125,
      "completions/min_length": 426.0,
      "completions/min_terminated_length": 426.0,
      "epoch": 0.3222857142857143,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.11850091814994812,
      "kl": 0.0007069905598958334,
      "learning_rate": 9.140576474687262e-08,
      "loss": 0.0538,
      "num_tokens": 40804793.0,
      "reward": 0.4166666865348816,
      "reward_std": 0.2263599932193756,
      "rewards/format_reward/mean": 0.4166666567325592,
      "rewards/format_reward/std": 0.49559465050697327,
      "step": 141
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.25,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3583.0,
      "completions/mean_length": 2979.229248046875,
      "completions/mean_terminated_length": 1648.7333984375,
      "completions/min_length": 473.0,
      "completions/min_terminated_length": 473.0,
      "epoch": 0.32457142857142857,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.11879272013902664,
      "kl": 0.0006297429402669271,
      "learning_rate": 9.122022088101613e-08,
      "loss": 0.0812,
      "num_tokens": 41106555.0,
      "reward": 0.3333333432674408,
      "reward_std": 0.21151940524578094,
      "rewards/format_reward/mean": 0.3333333432674408,
      "rewards/format_reward/std": 0.4738790988922119,
      "step": 142
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.20833333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3579.0,
      "completions/mean_length": 2892.875,
      "completions/mean_terminated_length": 1296.137939453125,
      "completions/min_length": 438.0,
      "completions/min_terminated_length": 438.0,
      "epoch": 0.32685714285714285,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.09709058701992035,
      "kl": 0.0006171862284342448,
      "learning_rate": 9.1032911692693e-08,
      "loss": 0.0669,
      "num_tokens": 41399667.0,
      "reward": 0.3229166865348816,
      "reward_std": 0.17728674411773682,
      "rewards/format_reward/mean": 0.3229166567325592,
      "rewards/format_reward/std": 0.4700457453727722,
      "step": 143
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.41666666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3513.0,
      "completions/mean_length": 2782.64599609375,
      "completions/mean_terminated_length": 1321.3529052734375,
      "completions/min_length": 186.0,
      "completions/min_terminated_length": 186.0,
      "epoch": 0.3291428571428571,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.11509748548269272,
      "kl": 0.0006793340047200521,
      "learning_rate": 9.084384631108882e-08,
      "loss": 0.0885,
      "num_tokens": 41684021.0,
      "reward": 0.3645833432674408,
      "reward_std": 0.1921273171901703,
      "rewards/format_reward/mean": 0.3645833432674408,
      "rewards/format_reward/std": 0.4838397204875946,
      "step": 144
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.29166666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3576.0,
      "completions/mean_length": 2931.479248046875,
      "completions/mean_terminated_length": 1563.290283203125,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "epoch": 0.3314285714285714,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.1290571242570877,
      "kl": 0.0006090799967447916,
      "learning_rate": 9.065303395098358e-08,
      "loss": 0.0442,
      "num_tokens": 41981907.0,
      "reward": 0.34375,
      "reward_std": 0.28895190358161926,
      "rewards/format_reward/mean": 0.34375,
      "rewards/format_reward/std": 0.4774521291255951,
      "step": 145
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.33333333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3542.0,
      "completions/mean_length": 2853.947998046875,
      "completions/mean_terminated_length": 1393.84375,
      "completions/min_length": 248.0,
      "completions/min_terminated_length": 248.0,
      "epoch": 0.33371428571428574,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.14284007251262665,
      "kl": 0.0005645751953125,
      "learning_rate": 9.046048391230248e-08,
      "loss": 0.0364,
      "num_tokens": 42271942.0,
      "reward": 0.34375,
      "reward_std": 0.24315834045410156,
      "rewards/format_reward/mean": 0.34375,
      "rewards/format_reward/std": 0.4774521291255951,
      "step": 146
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.3333333333333335,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3524.0,
      "completions/mean_length": 2461.30224609375,
      "completions/mean_terminated_length": 1659.3751220703125,
      "completions/min_length": 421.0,
      "completions/min_terminated_length": 421.0,
      "epoch": 0.336,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.10502398759126663,
      "kl": 0.0005308787027994791,
      "learning_rate": 9.026620557966279e-08,
      "loss": 0.0276,
      "num_tokens": 42523395.0,
      "reward": 0.6145833730697632,
      "reward_std": 0.20084445178508759,
      "rewards/format_reward/mean": 0.6145833134651184,
      "rewards/format_reward/std": 0.4892484247684479,
      "step": 147
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.33333333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3528.0,
      "completions/mean_length": 3048.791748046875,
      "completions/mean_terminated_length": 1978.375,
      "completions/min_length": 560.0,
      "completions/min_terminated_length": 560.0,
      "epoch": 0.3382857142857143,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.12605418264865875,
      "kl": 0.0006653467814127604,
      "learning_rate": 9.007020842191633e-08,
      "loss": 0.0445,
      "num_tokens": 42832849.0,
      "reward": 0.3541666865348816,
      "reward_std": 0.23116151988506317,
      "rewards/format_reward/mean": 0.3541666567325592,
      "rewards/format_reward/std": 0.48077061772346497,
      "step": 148
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.33333333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3009.0,
      "completions/mean_length": 2798.90625,
      "completions/mean_terminated_length": 1228.71875,
      "completions/min_length": 401.0,
      "completions/min_terminated_length": 401.0,
      "epoch": 0.3405714285714286,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.0965123325586319,
      "kl": 0.0006402333577473959,
      "learning_rate": 8.987250199168807e-08,
      "loss": 0.0422,
      "num_tokens": 43117594.0,
      "reward": 0.3645833432674408,
      "reward_std": 0.15985246002674103,
      "rewards/format_reward/mean": 0.3645833432674408,
      "rewards/format_reward/std": 0.4838397204875946,
      "step": 149
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.45833333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3252.0,
      "completions/mean_length": 2891.291748046875,
      "completions/mean_terminated_length": 1684.0,
      "completions/min_length": 362.0,
      "completions/min_terminated_length": 362.0,
      "epoch": 0.34285714285714286,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.11779191344976425,
      "kl": 0.0006155967712402344,
      "learning_rate": 8.967309592491052e-08,
      "loss": 0.049,
      "num_tokens": 43411262.0,
      "reward": 0.3958333432674408,
      "reward_std": 0.23899273574352264,
      "rewards/format_reward/mean": 0.3958333432674408,
      "rewards/format_reward/std": 0.4915960133075714,
      "step": 150
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7916666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3296.0,
      "completions/mean_length": 2597.447998046875,
      "completions/mean_terminated_length": 1381.465087890625,
      "completions/min_length": 316.0,
      "completions/min_terminated_length": 316.0,
      "epoch": 0.34514285714285714,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.09505832195281982,
      "kl": 0.0005811055501302084,
      "learning_rate": 8.9471999940354e-08,
      "loss": 0.0627,
      "num_tokens": 43676181.0,
      "reward": 0.4791666865348816,
      "reward_std": 0.2163209617137909,
      "rewards/format_reward/mean": 0.4791666567325592,
      "rewards/format_reward/std": 0.5021882057189941,
      "step": 151
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5416666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3479.0,
      "completions/mean_length": 2815.61474609375,
      "completions/mean_terminated_length": 1590.3514404296875,
      "completions/min_length": 527.0,
      "completions/min_terminated_length": 527.0,
      "epoch": 0.3474285714285714,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.0899856686592102,
      "kl": 0.0006135304768880209,
      "learning_rate": 8.926922383915315e-08,
      "loss": 0.0299,
      "num_tokens": 43962416.0,
      "reward": 0.40625,
      "reward_std": 0.14981341361999512,
      "rewards/format_reward/mean": 0.40625,
      "rewards/format_reward/std": 0.4937104284763336,
      "step": 152
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5833333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3415.0,
      "completions/mean_length": 2734.86474609375,
      "completions/mean_terminated_length": 1438.8157958984375,
      "completions/min_length": 473.0,
      "completions/min_terminated_length": 473.0,
      "epoch": 0.3497142857142857,
      "frac_reward_zero_std": 0.3125,
      "grad_norm": 0.16459882259368896,
      "kl": 0.0006030400594075521,
      "learning_rate": 8.906477750432904e-08,
      "loss": 0.1225,
      "num_tokens": 44240029.0,
      "reward": 0.40625,
      "reward_std": 0.30770808458328247,
      "rewards/format_reward/mean": 0.40625,
      "rewards/format_reward/std": 0.4937104284763336,
      "step": 153
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7083333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3537.0,
      "completions/mean_length": 2596.89599609375,
      "completions/mean_terminated_length": 1272.731689453125,
      "completions/min_length": 378.0,
      "completions/min_terminated_length": 378.0,
      "epoch": 0.352,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.08611822873353958,
      "kl": 0.0005370775858561198,
      "learning_rate": 8.88586709003076e-08,
      "loss": 0.0104,
      "num_tokens": 44505213.0,
      "reward": 0.4479166865348816,
      "reward_std": 0.13629473745822906,
      "rewards/format_reward/mean": 0.4479166567325592,
      "rewards/format_reward/std": 0.49989035725593567,
      "step": 154
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3564.0,
      "completions/mean_length": 3185.135498046875,
      "completions/mean_terminated_length": 1988.541748046875,
      "completions/min_length": 433.0,
      "completions/min_terminated_length": 433.0,
      "epoch": 0.35428571428571426,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.08149396628141403,
      "kl": 0.0005811055501302084,
      "learning_rate": 8.865091407243394e-08,
      "loss": 0.0194,
      "num_tokens": 44826592.0,
      "reward": 0.2708333432674408,
      "reward_std": 0.18536798655986786,
      "rewards/format_reward/mean": 0.2708333432674408,
      "rewards/format_reward/std": 0.44672295451164246,
      "step": 155
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.125,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3545.0,
      "completions/mean_length": 2321.385498046875,
      "completions/mean_terminated_length": 1207.313720703125,
      "completions/min_length": 287.0,
      "completions/min_terminated_length": 287.0,
      "epoch": 0.3565714285714286,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.09393462538719177,
      "kl": 0.0006497701009114584,
      "learning_rate": 8.844151714648274e-08,
      "loss": 0.0155,
      "num_tokens": 45064829.0,
      "reward": 0.59375,
      "reward_std": 0.16333210468292236,
      "rewards/format_reward/mean": 0.59375,
      "rewards/format_reward/std": 0.4937104284763336,
      "step": 156
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5833333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3489.0,
      "completions/mean_length": 2997.09375,
      "completions/mean_terminated_length": 2101.28955078125,
      "completions/min_length": 510.0,
      "completions/min_terminated_length": 510.0,
      "epoch": 0.3588571428571429,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.10948645323514938,
      "kl": 0.0005507469177246094,
      "learning_rate": 8.823049032816478e-08,
      "loss": 0.03,
      "num_tokens": 45369080.0,
      "reward": 0.4375,
      "reward_std": 0.29962682723999023,
      "rewards/format_reward/mean": 0.4375,
      "rewards/format_reward/std": 0.4986824691295624,
      "step": 157
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.16666666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3548.0,
      "completions/mean_length": 3103.791748046875,
      "completions/mean_terminated_length": 1937.571533203125,
      "completions/min_length": 396.0,
      "completions/min_terminated_length": 396.0,
      "epoch": 0.36114285714285715,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.09392761439085007,
      "kl": 0.0006020863850911459,
      "learning_rate": 8.801784390262942e-08,
      "loss": 0.0215,
      "num_tokens": 45683952.0,
      "reward": 0.354166716337204,
      "reward_std": 0.16661179065704346,
      "rewards/format_reward/mean": 0.3541666567325592,
      "rewards/format_reward/std": 0.4807705879211426,
      "step": 158
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.45833333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3529.0,
      "completions/mean_length": 2809.36474609375,
      "completions/mean_terminated_length": 1459.2857666015625,
      "completions/min_length": 493.0,
      "completions/min_terminated_length": 493.0,
      "epoch": 0.36342857142857143,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.15056172013282776,
      "kl": 0.0006561279296875,
      "learning_rate": 8.780358823396351e-08,
      "loss": 0.0551,
      "num_tokens": 45969137.0,
      "reward": 0.3645833432674408,
      "reward_std": 0.20084445178508759,
      "rewards/format_reward/mean": 0.3645833432674408,
      "rewards/format_reward/std": 0.4838397204875946,
      "step": 159
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.125,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3489.0,
      "completions/mean_length": 3053.90625,
      "completions/mean_terminated_length": 1699.2222900390625,
      "completions/min_length": 445.0,
      "completions/min_terminated_length": 445.0,
      "epoch": 0.3657142857142857,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.10764044523239136,
      "kl": 0.0006933212280273438,
      "learning_rate": 8.758773376468605e-08,
      "loss": 0.0445,
      "num_tokens": 46278746.0,
      "reward": 0.28125,
      "reward_std": 0.20084446668624878,
      "rewards/format_reward/mean": 0.28125,
      "rewards/format_reward/std": 0.45196935534477234,
      "step": 160
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5416666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3564.0,
      "completions/mean_length": 2655.23974609375,
      "completions/mean_terminated_length": 1174.2432861328125,
      "completions/min_length": 349.0,
      "completions/min_terminated_length": 349.0,
      "epoch": 0.368,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.08771516382694244,
      "kl": 0.0005706151326497396,
      "learning_rate": 8.737029101523929e-08,
      "loss": 0.0274,
      "num_tokens": 46549429.0,
      "reward": 0.40625,
      "reward_std": 0.14501188695430756,
      "rewards/format_reward/mean": 0.40625,
      "rewards/format_reward/std": 0.4937104284763336,
      "step": 161
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.75,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3577.0,
      "completions/mean_length": 2715.33349609375,
      "completions/mean_terminated_length": 1598.4761962890625,
      "completions/min_length": 450.0,
      "completions/min_terminated_length": 450.0,
      "epoch": 0.3702857142857143,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.11459244787693024,
      "kl": 0.0006163914998372396,
      "learning_rate": 8.715127058347614e-08,
      "loss": 0.0725,
      "num_tokens": 46825299.0,
      "reward": 0.4791666865348816,
      "reward_std": 0.22635996341705322,
      "rewards/format_reward/mean": 0.4791666567325592,
      "rewards/format_reward/std": 0.5021882057189941,
      "step": 162
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.875,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3393.0,
      "completions/mean_length": 2602.96875,
      "completions/mean_terminated_length": 1491.1334228515625,
      "completions/min_length": 448.0,
      "completions/min_terminated_length": 448.0,
      "epoch": 0.37257142857142855,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.13686417043209076,
      "kl": 0.0006634394327799479,
      "learning_rate": 8.693068314414344e-08,
      "loss": 0.0501,
      "num_tokens": 47091750.0,
      "reward": 0.5104166865348816,
      "reward_std": 0.2741113305091858,
      "rewards/format_reward/mean": 0.5104166865348816,
      "rewards/format_reward/std": 0.5025155544281006,
      "step": 163
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5833333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3161.0,
      "completions/mean_length": 2683.92724609375,
      "completions/mean_terminated_length": 1310.131591796875,
      "completions/min_length": 332.0,
      "completions/min_terminated_length": 332.0,
      "epoch": 0.37485714285714283,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.13948817551136017,
      "kl": 0.0006837844848632812,
      "learning_rate": 8.670853944836176e-08,
      "loss": 0.1137,
      "num_tokens": 47364659.0,
      "reward": 0.4062500596046448,
      "reward_std": 0.23311933875083923,
      "rewards/format_reward/mean": 0.40625,
      "rewards/format_reward/std": 0.4937104284763336,
      "step": 164
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.08333333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3441.0,
      "completions/mean_length": 3005.6875,
      "completions/mean_terminated_length": 1448.6923828125,
      "completions/min_length": 496.0,
      "completions/min_terminated_length": 496.0,
      "epoch": 0.37714285714285717,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.12864314019680023,
      "kl": 0.0006459554036458334,
      "learning_rate": 8.648485032310144e-08,
      "loss": 0.102,
      "num_tokens": 47669495.0,
      "reward": 0.2812500298023224,
      "reward_std": 0.23311930894851685,
      "rewards/format_reward/mean": 0.28125,
      "rewards/format_reward/std": 0.45196935534477234,
      "step": 165
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.04166666666666663,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3179.0,
      "completions/mean_length": 3095.53125,
      "completions/mean_terminated_length": 1545.1739501953125,
      "completions/min_length": 552.0,
      "completions/min_terminated_length": 552.0,
      "epoch": 0.37942857142857145,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.07900777459144592,
      "kl": 0.0006132125854492188,
      "learning_rate": 8.625962667065488e-08,
      "loss": 0.0139,
      "num_tokens": 47983370.0,
      "reward": 0.3020833432674408,
      "reward_std": 0.13629472255706787,
      "rewards/format_reward/mean": 0.3020833432674408,
      "rewards/format_reward/std": 0.46157145500183105,
      "step": 166
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.1666666666666665,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3384.0,
      "completions/mean_length": 2459.322998046875,
      "completions/mean_terminated_length": 1507.673095703125,
      "completions/min_length": 416.0,
      "completions/min_terminated_length": 416.0,
      "epoch": 0.38171428571428573,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.11869540065526962,
      "kl": 0.0005857149759928385,
      "learning_rate": 8.603287946810513e-08,
      "loss": 0.0159,
      "num_tokens": 48235479.0,
      "reward": 0.5625,
      "reward_std": 0.23116151988506317,
      "rewards/format_reward/mean": 0.5625,
      "rewards/format_reward/std": 0.4986824691295624,
      "step": 167
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.04166666666666663,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3556.0,
      "completions/mean_length": 3008.48974609375,
      "completions/mean_terminated_length": 1181.86962890625,
      "completions/min_length": 568.0,
      "completions/min_terminated_length": 568.0,
      "epoch": 0.384,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.08864536881446838,
      "kl": 0.0005278587341308594,
      "learning_rate": 8.580461976679099e-08,
      "loss": 0.0191,
      "num_tokens": 48540476.0,
      "reward": 0.28125,
      "reward_std": 0.14109627902507782,
      "rewards/format_reward/mean": 0.28125,
      "rewards/format_reward/std": 0.45196935534477234,
      "step": 168
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.2916666666666665,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3561.0,
      "completions/mean_length": 2370.20849609375,
      "completions/mean_terminated_length": 1465.3817138671875,
      "completions/min_length": 414.0,
      "completions/min_terminated_length": 414.0,
      "epoch": 0.3862857142857143,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.11368736624717712,
      "kl": 0.000568230946858724,
      "learning_rate": 8.557485869176825e-08,
      "loss": 0.0425,
      "num_tokens": 48783358.0,
      "reward": 0.59375,
      "reward_std": 0.219600647687912,
      "rewards/format_reward/mean": 0.59375,
      "rewards/format_reward/std": 0.4937104284763336,
      "step": 169
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.3333333333333335,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3570.0,
      "completions/mean_length": 2376.916748046875,
      "completions/mean_terminated_length": 1514.71435546875,
      "completions/min_length": 426.0,
      "completions/min_terminated_length": 426.0,
      "epoch": 0.38857142857142857,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.15671178698539734,
      "kl": 0.0005776087443033854,
      "learning_rate": 8.534360744126753e-08,
      "loss": 0.0616,
      "num_tokens": 49027166.0,
      "reward": 0.5937500596046448,
      "reward_std": 0.23311933875083923,
      "rewards/format_reward/mean": 0.59375,
      "rewards/format_reward/std": 0.4937104284763336,
      "step": 170
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.16666666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3260.0,
      "completions/mean_length": 2920.5625,
      "completions/mean_terminated_length": 1309.357177734375,
      "completions/min_length": 176.0,
      "completions/min_terminated_length": 176.0,
      "epoch": 0.39085714285714285,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 0.07653896510601044,
      "kl": 0.0005582173665364584,
      "learning_rate": 8.511087728614862e-08,
      "loss": 0.022,
      "num_tokens": 49323590.0,
      "reward": 0.3125,
      "reward_std": 0.09202303737401962,
      "rewards/format_reward/mean": 0.3125,
      "rewards/format_reward/std": 0.4659455418586731,
      "step": 171
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.1666666666666665,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3552.0,
      "completions/mean_length": 2526.291748046875,
      "completions/mean_terminated_length": 1631.3077392578125,
      "completions/min_length": 429.0,
      "completions/min_terminated_length": 429.0,
      "epoch": 0.3931428571428571,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.14437232911586761,
      "kl": 0.0005766550699869791,
      "learning_rate": 8.487667956935088e-08,
      "loss": 0.0502,
      "num_tokens": 49581444.0,
      "reward": 0.59375,
      "reward_std": 0.28806596994400024,
      "rewards/format_reward/mean": 0.59375,
      "rewards/format_reward/std": 0.4937104284763336,
      "step": 172
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.41666666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3507.0,
      "completions/mean_length": 3077.947998046875,
      "completions/mean_terminated_length": 2155.14697265625,
      "completions/min_length": 566.0,
      "completions/min_terminated_length": 566.0,
      "epoch": 0.3954285714285714,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.08480256795883179,
      "kl": 0.0005811055501302084,
      "learning_rate": 8.464102570534061e-08,
      "loss": 0.0511,
      "num_tokens": 49893529.0,
      "reward": 0.3645833432674408,
      "reward_std": 0.18208825588226318,
      "rewards/format_reward/mean": 0.3645833432674408,
      "rewards/format_reward/std": 0.4838397204875946,
      "step": 173
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3542.0,
      "completions/mean_length": 2870.09375,
      "completions/mean_terminated_length": 1680.25,
      "completions/min_length": 348.0,
      "completions/min_terminated_length": 348.0,
      "epoch": 0.3977142857142857,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.14392878115177155,
      "kl": 0.0005621910095214844,
      "learning_rate": 8.440392717955475e-08,
      "loss": 0.0238,
      "num_tokens": 50185636.0,
      "reward": 0.4479166865348816,
      "reward_std": 0.24183647334575653,
      "rewards/format_reward/mean": 0.4479166567325592,
      "rewards/format_reward/std": 0.49989035725593567,
      "step": 174
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.04166666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3126.0,
      "completions/mean_length": 2903.229248046875,
      "completions/mean_terminated_length": 969.8399658203125,
      "completions/min_length": 327.0,
      "completions/min_terminated_length": 327.0,
      "epoch": 0.4,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1288723200559616,
      "kl": 0.0005869865417480469,
      "learning_rate": 8.416539554784089e-08,
      "loss": 0.0268,
      "num_tokens": 50480072.0,
      "reward": 0.2708333432674408,
      "reward_std": 0.21764282882213593,
      "rewards/format_reward/mean": 0.2708333432674408,
      "rewards/format_reward/std": 0.44672295451164246,
      "step": 175
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.75,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 2978.0,
      "completions/mean_length": 2545.979248046875,
      "completions/mean_terminated_length": 1211.3809814453125,
      "completions/min_length": 457.0,
      "completions/min_terminated_length": 457.0,
      "epoch": 0.4022857142857143,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.09528925269842148,
      "kl": 0.0005660057067871094,
      "learning_rate": 8.392544243589427e-08,
      "loss": 0.0357,
      "num_tokens": 50740494.0,
      "reward": 0.46875,
      "reward_std": 0.17337113618850708,
      "rewards/format_reward/mean": 0.46875,
      "rewards/format_reward/std": 0.5016420483589172,
      "step": 176
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.375,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3576.0,
      "completions/mean_length": 2988.48974609375,
      "completions/mean_terminated_length": 1851.6060791015625,
      "completions/min_length": 313.0,
      "completions/min_terminated_length": 313.0,
      "epoch": 0.4045714285714286,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.11670532822608948,
      "kl": 0.0006411870320638021,
      "learning_rate": 8.368407953869103e-08,
      "loss": 0.0658,
      "num_tokens": 51043175.0,
      "reward": 0.3541666865348816,
      "reward_std": 0.26735198497772217,
      "rewards/format_reward/mean": 0.3541666567325592,
      "rewards/format_reward/std": 0.48077061772346497,
      "step": 177
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.16666666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3488.0,
      "completions/mean_length": 3167.71875,
      "completions/mean_terminated_length": 2156.75,
      "completions/min_length": 368.0,
      "completions/min_terminated_length": 368.0,
      "epoch": 0.40685714285714286,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.0889914259314537,
      "kl": 0.0005296071370442709,
      "learning_rate": 8.344131861991828e-08,
      "loss": 0.0184,
      "num_tokens": 51363368.0,
      "reward": 0.3854166865348816,
      "reward_std": 0.20084445178508759,
      "rewards/format_reward/mean": 0.3854166567325592,
      "rewards/format_reward/std": 0.4892484247684479,
      "step": 178
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.25,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3315.0,
      "completions/mean_length": 2948.70849609375,
      "completions/mean_terminated_length": 1551.0667724609375,
      "completions/min_length": 404.0,
      "completions/min_terminated_length": 404.0,
      "epoch": 0.40914285714285714,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.08148004114627838,
      "kl": 0.0005030632019042969,
      "learning_rate": 8.319717151140072e-08,
      "loss": 0.0195,
      "num_tokens": 51662044.0,
      "reward": 0.3645833432674408,
      "reward_std": 0.14109627902507782,
      "rewards/format_reward/mean": 0.3645833432674408,
      "rewards/format_reward/std": 0.4838397204875946,
      "step": 179
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.625,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3501.0,
      "completions/mean_length": 2793.03125,
      "completions/mean_terminated_length": 1637.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 512.0,
      "epoch": 0.4114285714285714,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.12045343965291977,
      "kl": 0.0006186167399088541,
      "learning_rate": 8.295165011252396e-08,
      "loss": 0.0617,
      "num_tokens": 51945883.0,
      "reward": 0.4375,
      "reward_std": 0.23639902472496033,
      "rewards/format_reward/mean": 0.4375,
      "rewards/format_reward/std": 0.4986824691295624,
      "step": 180
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7083333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3406.0,
      "completions/mean_length": 2654.479248046875,
      "completions/mean_terminated_length": 1407.5609130859375,
      "completions/min_length": 326.0,
      "completions/min_terminated_length": 326.0,
      "epoch": 0.4137142857142857,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.09716926515102386,
      "kl": 0.0005747477213541666,
      "learning_rate": 8.270476638965461e-08,
      "loss": 0.0472,
      "num_tokens": 52216547.0,
      "reward": 0.46875,
      "reward_std": 0.21827873587608337,
      "rewards/format_reward/mean": 0.46875,
      "rewards/format_reward/std": 0.5016420483589172,
      "step": 181
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.08333333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3549.0,
      "completions/mean_length": 3025.53125,
      "completions/mean_terminated_length": 1521.9615478515625,
      "completions/min_length": 458.0,
      "completions/min_terminated_length": 458.0,
      "epoch": 0.416,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.09838991612195969,
      "kl": 0.000541528065999349,
      "learning_rate": 8.245653237555705e-08,
      "loss": 0.0178,
      "num_tokens": 52522814.0,
      "reward": 0.2916666865348816,
      "reward_std": 0.1888476312160492,
      "rewards/format_reward/mean": 0.2916666567325592,
      "rewards/format_reward/std": 0.4569157063961029,
      "step": 182
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.41666666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3472.0,
      "completions/mean_length": 2912.39599609375,
      "completions/mean_terminated_length": 1687.7059326171875,
      "completions/min_length": 406.0,
      "completions/min_terminated_length": 406.0,
      "epoch": 0.41828571428571426,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.10540112853050232,
      "kl": 0.0005413691202799479,
      "learning_rate": 8.220696016880687e-08,
      "loss": 0.0488,
      "num_tokens": 52818964.0,
      "reward": 0.3854166865348816,
      "reward_std": 0.17728674411773682,
      "rewards/format_reward/mean": 0.3854166567325592,
      "rewards/format_reward/std": 0.4892484247684479,
      "step": 183
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3578.0,
      "completions/mean_length": 2859.30224609375,
      "completions/mean_terminated_length": 1651.4722900390625,
      "completions/min_length": 475.0,
      "completions/min_terminated_length": 475.0,
      "epoch": 0.4205714285714286,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.10342766344547272,
      "kl": 0.0004857381184895833,
      "learning_rate": 8.195606193320136e-08,
      "loss": 0.0414,
      "num_tokens": 53109933.0,
      "reward": 0.4270833432674408,
      "reward_std": 0.20564600825309753,
      "rewards/format_reward/mean": 0.4270833432674408,
      "rewards/format_reward/std": 0.4972511827945709,
      "step": 184
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.375,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3420.0,
      "completions/mean_length": 3069.3125,
      "completions/mean_terminated_length": 2086.727294921875,
      "completions/min_length": 670.0,
      "completions/min_terminated_length": 670.0,
      "epoch": 0.4228571428571429,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.12011417001485825,
      "kl": 0.0006421407063802084,
      "learning_rate": 8.170384989716656e-08,
      "loss": 0.0514,
      "num_tokens": 53420805.0,
      "reward": 0.3854166865348816,
      "reward_std": 0.22440218925476074,
      "rewards/format_reward/mean": 0.3854166567325592,
      "rewards/format_reward/std": 0.4892484247684479,
      "step": 185
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.75,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3288.0,
      "completions/mean_length": 2728.21875,
      "completions/mean_terminated_length": 1627.9285888671875,
      "completions/min_length": 405.0,
      "completions/min_terminated_length": 405.0,
      "epoch": 0.42514285714285716,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.12889675796031952,
      "kl": 0.0006151199340820312,
      "learning_rate": 8.145033635316129e-08,
      "loss": 0.078,
      "num_tokens": 53699148.0,
      "reward": 0.5,
      "reward_std": 0.24511615931987762,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.5026246905326843,
      "step": 186
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.6666666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3010.0,
      "completions/mean_length": 2672.041748046875,
      "completions/mean_terminated_length": 1395.300048828125,
      "completions/min_length": 489.0,
      "completions/min_terminated_length": 489.0,
      "epoch": 0.42742857142857144,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.09514724463224411,
      "kl": 0.0006281534830729166,
      "learning_rate": 8.119553365707802e-08,
      "loss": 0.0136,
      "num_tokens": 53971480.0,
      "reward": 0.4270833432674408,
      "reward_std": 0.1223401129245758,
      "rewards/format_reward/mean": 0.4270833432674408,
      "rewards/format_reward/std": 0.4972511827945709,
      "step": 187
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.08333333333333337,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3444.0,
      "completions/mean_length": 3115.510498046875,
      "completions/mean_terminated_length": 1539.681884765625,
      "completions/min_length": 549.0,
      "completions/min_terminated_length": 549.0,
      "epoch": 0.4297142857142857,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.08934146165847778,
      "kl": 0.0006567637125651041,
      "learning_rate": 8.093945422764069e-08,
      "loss": 0.016,
      "num_tokens": 54287393.0,
      "reward": 0.2708333432674408,
      "reward_std": 0.14305409789085388,
      "rewards/format_reward/mean": 0.2708333432674408,
      "rewards/format_reward/std": 0.44672295451164246,
      "step": 188
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3565.0,
      "completions/mean_length": 3076.89599609375,
      "completions/mean_terminated_length": 1555.5833740234375,
      "completions/min_length": 217.0,
      "completions/min_terminated_length": 217.0,
      "epoch": 0.432,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.10708770155906677,
      "kl": 0.0006875991821289062,
      "learning_rate": 8.068211054579943e-08,
      "loss": 0.008,
      "num_tokens": 54598999.0,
      "reward": 0.3125,
      "reward_std": 0.24859580397605896,
      "rewards/format_reward/mean": 0.3125,
      "rewards/format_reward/std": 0.4659455418586731,
      "step": 189
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.29166666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3584.0,
      "completions/mean_length": 2818.23974609375,
      "completions/mean_terminated_length": 1212.6129150390625,
      "completions/min_length": 202.0,
      "completions/min_terminated_length": 202.0,
      "epoch": 0.4342857142857143,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.08781887590885162,
      "kl": 0.0005850791931152344,
      "learning_rate": 8.04235151541222e-08,
      "loss": 0.0144,
      "num_tokens": 54885888.0,
      "reward": 0.3541666865348816,
      "reward_std": 0.1801304817199707,
      "rewards/format_reward/mean": 0.3541666567325592,
      "rewards/format_reward/std": 0.48077061772346497,
      "step": 190
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.8333333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3579.0,
      "completions/mean_length": 2877.25,
      "completions/mean_terminated_length": 2042.0,
      "completions/min_length": 677.0,
      "completions/min_terminated_length": 677.0,
      "epoch": 0.43657142857142855,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.11006169021129608,
      "kl": 0.0006113052368164062,
      "learning_rate": 8.016368065618359e-08,
      "loss": 0.0358,
      "num_tokens": 55178484.0,
      "reward": 0.5208333730697632,
      "reward_std": 0.26735198497772217,
      "rewards/format_reward/mean": 0.5208333134651184,
      "rewards/format_reward/std": 0.5021882057189941,
      "step": 191
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5416666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3515.0,
      "completions/mean_length": 2934.20849609375,
      "completions/mean_terminated_length": 1898.0540771484375,
      "completions/min_length": 364.0,
      "completions/min_terminated_length": 364.0,
      "epoch": 0.43885714285714283,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.10060418397188187,
      "kl": 0.0006313323974609375,
      "learning_rate": 7.990261971595047e-08,
      "loss": 0.043,
      "num_tokens": 55476494.0,
      "reward": 0.4166666865348816,
      "reward_std": 0.18536798655986786,
      "rewards/format_reward/mean": 0.4166666567325592,
      "rewards/format_reward/std": 0.4955946207046509,
      "step": 192
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.875,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3563.0,
      "completions/mean_length": 2798.729248046875,
      "completions/mean_terminated_length": 1908.755615234375,
      "completions/min_length": 447.0,
      "completions/min_terminated_length": 447.0,
      "epoch": 0.44114285714285717,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.11005173623561859,
      "kl": 0.000522454579671224,
      "learning_rate": 7.964034505716476e-08,
      "loss": 0.0196,
      "num_tokens": 55760394.0,
      "reward": 0.5625,
      "reward_std": 0.26343637704849243,
      "rewards/format_reward/mean": 0.5625,
      "rewards/format_reward/std": 0.4986824691295624,
      "step": 193
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.20833333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3525.0,
      "completions/mean_length": 3145.041748046875,
      "completions/mean_terminated_length": 2130.896484375,
      "completions/min_length": 843.0,
      "completions/min_terminated_length": 843.0,
      "epoch": 0.44342857142857145,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.11194142699241638,
      "kl": 0.0006106694539388021,
      "learning_rate": 7.93768694627233e-08,
      "loss": 0.0607,
      "num_tokens": 56078308.0,
      "reward": 0.3333333432674408,
      "reward_std": 0.25863486528396606,
      "rewards/format_reward/mean": 0.3333333432674408,
      "rewards/format_reward/std": 0.4738790988922119,
      "step": 194
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3458.0,
      "completions/mean_length": 2681.30224609375,
      "completions/mean_terminated_length": 1176.8055419921875,
      "completions/min_length": 447.0,
      "completions/min_terminated_length": 447.0,
      "epoch": 0.44571428571428573,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 0.09316386282444,
      "kl": 0.0005966822306315104,
      "learning_rate": 7.911220577405484e-08,
      "loss": 0.009,
      "num_tokens": 56352117.0,
      "reward": 0.40625,
      "reward_std": 0.0765465646982193,
      "rewards/format_reward/mean": 0.40625,
      "rewards/format_reward/std": 0.4937104284763336,
      "step": 195
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.125,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3538.0,
      "completions/mean_length": 3059.58349609375,
      "completions/mean_terminated_length": 1719.407470703125,
      "completions/min_length": 397.0,
      "completions/min_terminated_length": 397.0,
      "epoch": 0.448,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.0975174605846405,
      "kl": 0.0005820592244466146,
      "learning_rate": 7.884636689049422e-08,
      "loss": 0.0282,
      "num_tokens": 56662079.0,
      "reward": 0.3020833730697632,
      "reward_std": 0.1921273171901703,
      "rewards/format_reward/mean": 0.3020833432674408,
      "rewards/format_reward/std": 0.46157145500183105,
      "step": 196
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5416666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3539.0,
      "completions/mean_length": 2882.52099609375,
      "completions/mean_terminated_length": 1763.946044921875,
      "completions/min_length": 398.0,
      "completions/min_terminated_length": 398.0,
      "epoch": 0.4502857142857143,
      "frac_reward_zero_std": 0.3125,
      "grad_norm": 0.13356514275074005,
      "kl": 0.0005550384521484375,
      "learning_rate": 7.857936576865357e-08,
      "loss": 0.0199,
      "num_tokens": 56953957.0,
      "reward": 0.46875,
      "reward_std": 0.3116236925125122,
      "rewards/format_reward/mean": 0.46875,
      "rewards/format_reward/std": 0.5016420483589172,
      "step": 197
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.08333333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3445.0,
      "completions/mean_length": 3071.135498046875,
      "completions/mean_terminated_length": 1690.34619140625,
      "completions/min_length": 247.0,
      "completions/min_terminated_length": 247.0,
      "epoch": 0.45257142857142857,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.10992588847875595,
      "kl": 0.0006163914998372396,
      "learning_rate": 7.831121542179086e-08,
      "loss": 0.0085,
      "num_tokens": 57264686.0,
      "reward": 0.3229166865348816,
      "reward_std": 0.24183647334575653,
      "rewards/format_reward/mean": 0.3229166567325592,
      "rewards/format_reward/std": 0.4700457453727722,
      "step": 198
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.29166666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3561.0,
      "completions/mean_length": 2945.75,
      "completions/mean_terminated_length": 1607.4837646484375,
      "completions/min_length": 492.0,
      "completions/min_terminated_length": 492.0,
      "epoch": 0.45485714285714285,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.10106715559959412,
      "kl": 0.0005825360616048177,
      "learning_rate": 7.804192891917572e-08,
      "loss": 0.0392,
      "num_tokens": 57562844.0,
      "reward": 0.3333333432674408,
      "reward_std": 0.2076038122177124,
      "rewards/format_reward/mean": 0.3333333432674408,
      "rewards/format_reward/std": 0.4738790988922119,
      "step": 199
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.04166666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3381.0,
      "completions/mean_length": 3020.75,
      "completions/mean_terminated_length": 1421.1199951171875,
      "completions/min_length": 419.0,
      "completions/min_terminated_length": 419.0,
      "epoch": 0.45714285714285713,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.13515648245811462,
      "kl": 0.0005766550699869791,
      "learning_rate": 7.777151938545235e-08,
      "loss": 0.0411,
      "num_tokens": 57869174.0,
      "reward": 0.3125,
      "reward_std": 0.2861081659793854,
      "rewards/format_reward/mean": 0.3125,
      "rewards/format_reward/std": 0.4659455418586731,
      "step": 200
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.33333333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3466.0,
      "completions/mean_length": 3027.4375,
      "completions/mean_terminated_length": 1914.3125,
      "completions/min_length": 579.0,
      "completions/min_terminated_length": 579.0,
      "epoch": 0.4594285714285714,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.12819920480251312,
      "kl": 0.0006488164265950521,
      "learning_rate": 7.75e-08,
      "loss": -0.0065,
      "num_tokens": 58175660.0,
      "reward": 0.40625,
      "reward_std": 0.27019572257995605,
      "rewards/format_reward/mean": 0.40625,
      "rewards/format_reward/std": 0.4937104284763336,
      "step": 201
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.6666666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3552.0,
      "completions/mean_length": 2730.55224609375,
      "completions/mean_terminated_length": 1535.7249755859375,
      "completions/min_length": 289.0,
      "completions/min_terminated_length": 289.0,
      "epoch": 0.4617142857142857,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.12962771952152252,
      "kl": 0.0005675951639811198,
      "learning_rate": 7.722738399629039e-08,
      "loss": 0.0167,
      "num_tokens": 58454473.0,
      "reward": 0.4270833432674408,
      "reward_std": 0.22831778228282928,
      "rewards/format_reward/mean": 0.4270833432674408,
      "rewards/format_reward/std": 0.4972511827945709,
      "step": 202
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.625,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3376.0,
      "completions/mean_length": 2718.95849609375,
      "completions/mean_terminated_length": 1454.6666259765625,
      "completions/min_length": 346.0,
      "completions/min_terminated_length": 346.0,
      "epoch": 0.464,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.11856161057949066,
      "kl": 0.0005909601847330729,
      "learning_rate": 7.695368466124297e-08,
      "loss": 0.0439,
      "num_tokens": 58730943.0,
      "reward": 0.4479166865348816,
      "reward_std": 0.23703491687774658,
      "rewards/format_reward/mean": 0.4479166567325592,
      "rewards/format_reward/std": 0.49989035725593567,
      "step": 203
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3486.0,
      "completions/mean_length": 2858.854248046875,
      "completions/mean_terminated_length": 1650.27783203125,
      "completions/min_length": 477.0,
      "completions/min_terminated_length": 477.0,
      "epoch": 0.4662857142857143,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.09397979080677032,
      "kl": 0.0005482037862141927,
      "learning_rate": 7.667891533457717e-08,
      "loss": 0.0432,
      "num_tokens": 59021467.0,
      "reward": 0.4270833432674408,
      "reward_std": 0.19080542027950287,
      "rewards/format_reward/mean": 0.4270833432674408,
      "rewards/format_reward/std": 0.4972511827945709,
      "step": 204
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5833333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3552.0,
      "completions/mean_length": 2655.635498046875,
      "completions/mean_terminated_length": 1238.657958984375,
      "completions/min_length": 364.0,
      "completions/min_terminated_length": 364.0,
      "epoch": 0.4685714285714286,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.13924960792064667,
      "kl": 0.0005423227945963541,
      "learning_rate": 7.640308940816239e-08,
      "loss": 0.0616,
      "num_tokens": 59293232.0,
      "reward": 0.4270833432674408,
      "reward_std": 0.2653941810131073,
      "rewards/format_reward/mean": 0.4270833432674408,
      "rewards/format_reward/std": 0.4972511827945709,
      "step": 205
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3398.0,
      "completions/mean_length": 2803.885498046875,
      "completions/mean_terminated_length": 1503.6944580078125,
      "completions/min_length": 633.0,
      "completions/min_terminated_length": 633.0,
      "epoch": 0.47085714285714286,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.09617812931537628,
      "kl": 0.0006184577941894531,
      "learning_rate": 7.612622032536508e-08,
      "loss": 0.0661,
      "num_tokens": 59577705.0,
      "reward": 0.3958333432674408,
      "reward_std": 0.1430540829896927,
      "rewards/format_reward/mean": 0.3958333432674408,
      "rewards/format_reward/std": 0.4915960133075714,
      "step": 206
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0416666666666665,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3548.0,
      "completions/mean_length": 2478.39599609375,
      "completions/mean_terminated_length": 1417.9183349609375,
      "completions/min_length": 312.0,
      "completions/min_terminated_length": 312.0,
      "epoch": 0.47314285714285714,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.12245610356330872,
      "kl": 0.0006500879923502604,
      "learning_rate": 7.584832158039378e-08,
      "loss": 0.068,
      "num_tokens": 59831855.0,
      "reward": 0.5729167461395264,
      "reward_std": 0.25055360794067383,
      "rewards/format_reward/mean": 0.5729166865348816,
      "rewards/format_reward/std": 0.4972512125968933,
      "step": 207
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.625,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3555.0,
      "completions/mean_length": 2997.36474609375,
      "completions/mean_terminated_length": 2139.974365234375,
      "completions/min_length": 395.0,
      "completions/min_terminated_length": 395.0,
      "epoch": 0.4754285714285714,
      "frac_reward_zero_std": 0.3125,
      "grad_norm": 0.12236214429140091,
      "kl": 0.0006504058837890625,
      "learning_rate": 7.556940671764124e-08,
      "loss": 0.0698,
      "num_tokens": 60136132.0,
      "reward": 0.5,
      "reward_std": 0.3406188488006592,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.5026246905326843,
      "step": 208
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.16666666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3427.0,
      "completions/mean_length": 3120.71875,
      "completions/mean_terminated_length": 1995.607177734375,
      "completions/min_length": 439.0,
      "completions/min_terminated_length": 439.0,
      "epoch": 0.4777142857142857,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.09329062700271606,
      "kl": 0.0006885528564453125,
      "learning_rate": 7.528948933102439e-08,
      "loss": 0.0105,
      "num_tokens": 60451075.0,
      "reward": 0.3125000298023224,
      "reward_std": 0.1801304817199707,
      "rewards/format_reward/mean": 0.3125,
      "rewards/format_reward/std": 0.4659455418586731,
      "step": 209
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3435.0,
      "completions/mean_length": 3035.822998046875,
      "completions/mean_terminated_length": 1391.291748046875,
      "completions/min_length": 365.0,
      "completions/min_terminated_length": 365.0,
      "epoch": 0.48,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.11040231585502625,
      "kl": 0.0006093978881835938,
      "learning_rate": 7.500858306332172e-08,
      "loss": 0.0659,
      "num_tokens": 60758588.0,
      "reward": 0.3125000298023224,
      "reward_std": 0.2721535265445709,
      "rewards/format_reward/mean": 0.3125,
      "rewards/format_reward/std": 0.4659455418586731,
      "step": 210
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.625,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3099.0,
      "completions/mean_length": 3452.354248046875,
      "completions/mean_terminated_length": 2179.77783203125,
      "completions/min_length": 1042.0,
      "completions/min_terminated_length": 1042.0,
      "epoch": 0.48228571428571426,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.08212711662054062,
      "kl": 0.0005893707275390625,
      "learning_rate": 7.472670160550848e-08,
      "loss": 0.0362,
      "num_tokens": 61106274.0,
      "reward": 0.1458333432674408,
      "reward_std": 0.19276320934295654,
      "rewards/format_reward/mean": 0.1458333283662796,
      "rewards/format_reward/std": 0.3547917604446411,
      "step": 211
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.41666666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3243.0,
      "completions/mean_length": 2831.416748046875,
      "completions/mean_terminated_length": 1459.058837890625,
      "completions/min_length": 383.0,
      "completions/min_terminated_length": 383.0,
      "epoch": 0.4845714285714286,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.11112417280673981,
      "kl": 0.0006930033365885416,
      "learning_rate": 7.444385869608922e-08,
      "loss": 0.0581,
      "num_tokens": 61393804.0,
      "reward": 0.3854166865348816,
      "reward_std": 0.22308027744293213,
      "rewards/format_reward/mean": 0.3854166567325592,
      "rewards/format_reward/std": 0.4892484247684479,
      "step": 212
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.375,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3468.0,
      "completions/mean_length": 2847.4375,
      "completions/mean_terminated_length": 1441.2728271484375,
      "completions/min_length": 318.0,
      "completions/min_terminated_length": 318.0,
      "epoch": 0.4868571428571429,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.10535288602113724,
      "kl": 0.0006008148193359375,
      "learning_rate": 7.416006812042826e-08,
      "loss": 0.0266,
      "num_tokens": 61682806.0,
      "reward": 0.375,
      "reward_std": 0.2076038122177124,
      "rewards/format_reward/mean": 0.375,
      "rewards/format_reward/std": 0.4866642653942108,
      "step": 213
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.625,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3546.0,
      "completions/mean_length": 2808.90625,
      "completions/mean_terminated_length": 1676.076904296875,
      "completions/min_length": 361.0,
      "completions/min_terminated_length": 361.0,
      "epoch": 0.48914285714285716,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.13144823908805847,
      "kl": 0.0006440480550130209,
      "learning_rate": 7.387534371007797e-08,
      "loss": 0.0766,
      "num_tokens": 61968043.0,
      "reward": 0.4270833432674408,
      "reward_std": 0.27934882044792175,
      "rewards/format_reward/mean": 0.4270833432674408,
      "rewards/format_reward/std": 0.4972511827945709,
      "step": 214
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0833333333333335,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3489.0,
      "completions/mean_length": 2482.05224609375,
      "completions/mean_terminated_length": 1468.260009765625,
      "completions/min_length": 366.0,
      "completions/min_terminated_length": 366.0,
      "epoch": 0.49142857142857144,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.14610359072685242,
      "kl": 0.0006596247355143229,
      "learning_rate": 7.358969934210438e-08,
      "loss": 0.0498,
      "num_tokens": 62221914.0,
      "reward": 0.5520833730697632,
      "reward_std": 0.2196006178855896,
      "rewards/format_reward/mean": 0.5520833134651184,
      "rewards/format_reward/std": 0.4998903274536133,
      "step": 215
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5833333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3544.0,
      "completions/mean_length": 2841.61474609375,
      "completions/mean_terminated_length": 1708.5,
      "completions/min_length": 352.0,
      "completions/min_terminated_length": 352.0,
      "epoch": 0.4937142857142857,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.07923561334609985,
      "kl": 0.0006917317708333334,
      "learning_rate": 7.3303148938411e-08,
      "loss": 0.0332,
      "num_tokens": 62511071.0,
      "reward": 0.4166666865348816,
      "reward_std": 0.11558075994253159,
      "rewards/format_reward/mean": 0.4166666567325592,
      "rewards/format_reward/std": 0.49559465050697327,
      "step": 216
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.41666666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3569.0,
      "completions/mean_length": 2866.89599609375,
      "completions/mean_terminated_length": 1559.2353515625,
      "completions/min_length": 354.0,
      "completions/min_terminated_length": 354.0,
      "epoch": 0.496,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.12923845648765564,
      "kl": 0.0005966822306315104,
      "learning_rate": 7.301570646506028e-08,
      "loss": 0.0594,
      "num_tokens": 62802661.0,
      "reward": 0.4270833432674408,
      "reward_std": 0.29766902327537537,
      "rewards/format_reward/mean": 0.4270833432674408,
      "rewards/format_reward/std": 0.4972511827945709,
      "step": 217
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7083333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3530.0,
      "completions/mean_length": 2852.635498046875,
      "completions/mean_terminated_length": 1871.5364990234375,
      "completions/min_length": 562.0,
      "completions/min_terminated_length": 562.0,
      "epoch": 0.4982857142857143,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.10057669878005981,
      "kl": 0.0007403691609700521,
      "learning_rate": 7.27273859315928e-08,
      "loss": 0.0616,
      "num_tokens": 63092162.0,
      "reward": 0.4583333432674408,
      "reward_std": 0.2076037973165512,
      "rewards/format_reward/mean": 0.4583333432674408,
      "rewards/format_reward/std": 0.5008764266967773,
      "step": 218
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0833333333333335,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3418.0,
      "completions/mean_length": 2374.510498046875,
      "completions/mean_terminated_length": 1261.780029296875,
      "completions/min_length": 393.0,
      "completions/min_terminated_length": 393.0,
      "epoch": 0.5005714285714286,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.1514943689107895,
      "kl": 0.0006774266560872396,
      "learning_rate": 7.243820139034463e-08,
      "loss": 0.0621,
      "num_tokens": 63335175.0,
      "reward": 0.53125,
      "reward_std": 0.17728674411773682,
      "rewards/format_reward/mean": 0.53125,
      "rewards/format_reward/std": 0.5016420483589172,
      "step": 219
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.6666666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3493.0,
      "completions/mean_length": 2696.041748046875,
      "completions/mean_terminated_length": 1452.9000244140625,
      "completions/min_length": 483.0,
      "completions/min_terminated_length": 483.0,
      "epoch": 0.5028571428571429,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.09723657369613647,
      "kl": 0.0006256103515625,
      "learning_rate": 7.214816693576235e-08,
      "loss": 0.0303,
      "num_tokens": 63610441.0,
      "reward": 0.4791666865348816,
      "reward_std": 0.15657275915145874,
      "rewards/format_reward/mean": 0.4791666567325592,
      "rewards/format_reward/std": 0.5021882057189941,
      "step": 220
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.16666666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3067.0,
      "completions/mean_length": 2935.53125,
      "completions/mean_terminated_length": 1360.6785888671875,
      "completions/min_length": 447.0,
      "completions/min_terminated_length": 447.0,
      "epoch": 0.5051428571428571,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 0.06203387677669525,
      "kl": 0.0005817413330078125,
      "learning_rate": 7.185729670371604e-08,
      "loss": 0.0081,
      "num_tokens": 63908824.0,
      "reward": 0.3229166865348816,
      "reward_std": 0.09006524085998535,
      "rewards/format_reward/mean": 0.3229166567325592,
      "rewards/format_reward/std": 0.4700457453727722,
      "step": 221
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.29166666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3261.0,
      "completions/mean_length": 2861.479248046875,
      "completions/mean_terminated_length": 1346.51611328125,
      "completions/min_length": 341.0,
      "completions/min_terminated_length": 341.0,
      "epoch": 0.5074285714285715,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.12795600295066833,
      "kl": 0.0006310145060221354,
      "learning_rate": 7.156560487081051e-08,
      "loss": 0.0535,
      "num_tokens": 64200392.0,
      "reward": 0.34375,
      "reward_std": 0.24183645844459534,
      "rewards/format_reward/mean": 0.34375,
      "rewards/format_reward/std": 0.4774521291255951,
      "step": 222
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7083333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3317.0,
      "completions/mean_length": 2594.78125,
      "completions/mean_terminated_length": 1267.7803955078125,
      "completions/min_length": 563.0,
      "completions/min_terminated_length": 563.0,
      "epoch": 0.5097142857142857,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.09798948466777802,
      "kl": 0.000594933827718099,
      "learning_rate": 7.127310565369415e-08,
      "loss": 0.0282,
      "num_tokens": 64465505.0,
      "reward": 0.4479166865348816,
      "reward_std": 0.12234010547399521,
      "rewards/format_reward/mean": 0.4479166567325592,
      "rewards/format_reward/std": 0.49989035725593567,
      "step": 223
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.04166666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3265.0,
      "completions/mean_length": 3218.697998046875,
      "completions/mean_terminated_length": 2181.239990234375,
      "completions/min_length": 654.0,
      "completions/min_terminated_length": 654.0,
      "epoch": 0.512,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.11903919279575348,
      "kl": 0.0006157557169596354,
      "learning_rate": 7.097981330836617e-08,
      "loss": 0.0783,
      "num_tokens": 64790640.0,
      "reward": 0.3333333432674408,
      "reward_std": 0.3506578803062439,
      "rewards/format_reward/mean": 0.3333333432674408,
      "rewards/format_reward/std": 0.4738790988922119,
      "step": 224
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7083333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3265.0,
      "completions/mean_length": 2455.5,
      "completions/mean_terminated_length": 941.6585083007812,
      "completions/min_length": 305.0,
      "completions/min_terminated_length": 305.0,
      "epoch": 0.5142857142857142,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.07495436072349548,
      "kl": 0.0005944569905598959,
      "learning_rate": 7.068574212948168e-08,
      "loss": -0.0009,
      "num_tokens": 65042346.0,
      "reward": 0.4375,
      "reward_std": 0.11077922582626343,
      "rewards/format_reward/mean": 0.4375,
      "rewards/format_reward/std": 0.4986824691295624,
      "step": 225
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.16666666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3180.0,
      "completions/mean_length": 3073.77099609375,
      "completions/mean_terminated_length": 1834.6429443359375,
      "completions/min_length": 767.0,
      "completions/min_terminated_length": 767.0,
      "epoch": 0.5165714285714286,
      "frac_reward_zero_std": 0.3125,
      "grad_norm": 0.1402408331632614,
      "kl": 0.0006856918334960938,
      "learning_rate": 7.039090644965509e-08,
      "loss": 0.055,
      "num_tokens": 65353040.0,
      "reward": 0.3645833432674408,
      "reward_std": 0.3116236627101898,
      "rewards/format_reward/mean": 0.3645833432674408,
      "rewards/format_reward/std": 0.4838397204875946,
      "step": 226
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.29166666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3357.0,
      "completions/mean_length": 2995.33349609375,
      "completions/mean_terminated_length": 1761.0322265625,
      "completions/min_length": 510.0,
      "completions/min_terminated_length": 510.0,
      "epoch": 0.5188571428571429,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.0822618380188942,
      "kl": 0.0005601247151692709,
      "learning_rate": 7.009532063876148e-08,
      "loss": 0.043,
      "num_tokens": 65656558.0,
      "reward": 0.4270833432674408,
      "reward_std": 0.21436314284801483,
      "rewards/format_reward/mean": 0.4270833432674408,
      "rewards/format_reward/std": 0.4972511827945709,
      "step": 227
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.41666666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3571.0,
      "completions/mean_length": 2981.666748046875,
      "completions/mean_terminated_length": 1883.2940673828125,
      "completions/min_length": 474.0,
      "completions/min_terminated_length": 474.0,
      "epoch": 0.5211428571428571,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.12317512929439545,
      "kl": 0.0006227493286132812,
      "learning_rate": 6.979899910323624e-08,
      "loss": 0.0284,
      "num_tokens": 65959184.0,
      "reward": 0.3541666865348816,
      "reward_std": 0.27606913447380066,
      "rewards/format_reward/mean": 0.3541666567325592,
      "rewards/format_reward/std": 0.48077061772346497,
      "step": 228
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.125,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3531.0,
      "completions/mean_length": 2456.197998046875,
      "completions/mean_terminated_length": 1461.0784912109375,
      "completions/min_length": 396.0,
      "completions/min_terminated_length": 396.0,
      "epoch": 0.5234285714285715,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.10438801348209381,
      "kl": 0.0005610783894856771,
      "learning_rate": 6.950195628537299e-08,
      "loss": 0.0672,
      "num_tokens": 66209973.0,
      "reward": 0.5520833730697632,
      "reward_std": 0.22308027744293213,
      "rewards/format_reward/mean": 0.5520833134651184,
      "rewards/format_reward/std": 0.4998903274536133,
      "step": 229
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3477.0,
      "completions/mean_length": 2736.041748046875,
      "completions/mean_terminated_length": 1322.77783203125,
      "completions/min_length": 449.0,
      "completions/min_terminated_length": 449.0,
      "epoch": 0.5257142857142857,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 0.07311544567346573,
      "kl": 0.0006850560506184896,
      "learning_rate": 6.920420666261962e-08,
      "loss": 0.0119,
      "num_tokens": 66488395.0,
      "reward": 0.4166666865348816,
      "reward_std": 0.08330589532852173,
      "rewards/format_reward/mean": 0.4166666567325592,
      "rewards/format_reward/std": 0.4955946207046509,
      "step": 230
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3501.0,
      "completions/mean_length": 2743.08349609375,
      "completions/mean_terminated_length": 1341.5555419921875,
      "completions/min_length": 320.0,
      "completions/min_terminated_length": 320.0,
      "epoch": 0.528,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1263621300458908,
      "kl": 0.0006767908732096354,
      "learning_rate": 6.890576474687262e-08,
      "loss": 0.0659,
      "num_tokens": 66767253.0,
      "reward": 0.4166666865348816,
      "reward_std": 0.23987865447998047,
      "rewards/format_reward/mean": 0.4166666567325592,
      "rewards/format_reward/std": 0.49559465050697327,
      "step": 231
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.625,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3556.0,
      "completions/mean_length": 2718.635498046875,
      "completions/mean_terminated_length": 1453.871826171875,
      "completions/min_length": 270.0,
      "completions/min_terminated_length": 270.0,
      "epoch": 0.5302857142857142,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.11518718302249908,
      "kl": 0.0006265640258789062,
      "learning_rate": 6.860664508377e-08,
      "loss": 0.043,
      "num_tokens": 67044628.0,
      "reward": 0.4375,
      "reward_std": 0.29090970754623413,
      "rewards/format_reward/mean": 0.4375,
      "rewards/format_reward/std": 0.4986824691295624,
      "step": 232
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.7083333333333335,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3562.0,
      "completions/mean_length": 2087.52099609375,
      "completions/mean_terminated_length": 1373.8154296875,
      "completions/min_length": 176.0,
      "completions/min_terminated_length": 176.0,
      "epoch": 0.5325714285714286,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.12469115853309631,
      "kl": 0.0006173451741536459,
      "learning_rate": 6.83068622519821e-08,
      "loss": 0.04,
      "num_tokens": 67260312.0,
      "reward": 0.7083333730697632,
      "reward_std": 0.22112248837947845,
      "rewards/format_reward/mean": 0.7083333134651184,
      "rewards/format_reward/std": 0.4569157063961029,
      "step": 233
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.625,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3559.0,
      "completions/mean_length": 2580.197998046875,
      "completions/mean_terminated_length": 1113.1025390625,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "epoch": 0.5348571428571428,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 0.07785586267709732,
      "kl": 0.0006580352783203125,
      "learning_rate": 6.800643086250122e-08,
      "loss": -0.0065,
      "num_tokens": 67522945.0,
      "reward": 0.4270833432674408,
      "reward_std": 0.085263691842556,
      "rewards/format_reward/mean": 0.4270833432674408,
      "rewards/format_reward/std": 0.4972511827945709,
      "step": 234
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.6666666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3516.0,
      "completions/mean_length": 2848.385498046875,
      "completions/mean_terminated_length": 1818.5250244140625,
      "completions/min_length": 496.0,
      "completions/min_terminated_length": 496.0,
      "epoch": 0.5371428571428571,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.10632475465536118,
      "kl": 0.0005680720011393229,
      "learning_rate": 6.770536555792944e-08,
      "loss": 0.0955,
      "num_tokens": 67812518.0,
      "reward": 0.4270833432674408,
      "reward_std": 0.24183645844459534,
      "rewards/format_reward/mean": 0.4270833432674408,
      "rewards/format_reward/std": 0.4972511827945709,
      "step": 235
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7916666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3554.0,
      "completions/mean_length": 2449.1875,
      "completions/mean_terminated_length": 1050.465087890625,
      "completions/min_length": 349.0,
      "completions/min_terminated_length": 349.0,
      "epoch": 0.5394285714285715,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.13193044066429138,
      "kl": 0.0006238619486490885,
      "learning_rate": 6.740368101176495e-08,
      "loss": 0.035,
      "num_tokens": 68063360.0,
      "reward": 0.5104166865348816,
      "reward_std": 0.20564600825309753,
      "rewards/format_reward/mean": 0.5104166865348816,
      "rewards/format_reward/std": 0.5025155544281006,
      "step": 236
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.9583333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3029.0,
      "completions/mean_length": 2447.40625,
      "completions/mean_terminated_length": 1262.44677734375,
      "completions/min_length": 312.0,
      "completions/min_terminated_length": 312.0,
      "epoch": 0.5417142857142857,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.11480724811553955,
      "kl": 0.0006078084309895834,
      "learning_rate": 6.710139192768695e-08,
      "loss": 0.0312,
      "num_tokens": 68314673.0,
      "reward": 0.53125,
      "reward_std": 0.17728674411773682,
      "rewards/format_reward/mean": 0.53125,
      "rewards/format_reward/std": 0.5016420483589172,
      "step": 237
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.3333333333333335,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3576.0,
      "completions/mean_length": 2243.0625,
      "completions/mean_terminated_length": 1285.25,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 384.0,
      "epoch": 0.544,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 0.07083650678396225,
      "kl": 0.0006602605183919271,
      "learning_rate": 6.67985130388389e-08,
      "loss": 0.0415,
      "num_tokens": 68545403.0,
      "reward": 0.6145833730697632,
      "reward_std": 0.09878238290548325,
      "rewards/format_reward/mean": 0.6145833134651184,
      "rewards/format_reward/std": 0.4892484247684479,
      "step": 238
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.625,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3442.0,
      "completions/mean_length": 2674.08349609375,
      "completions/mean_terminated_length": 1344.2052001953125,
      "completions/min_length": 323.0,
      "completions/min_terminated_length": 323.0,
      "epoch": 0.5462857142857143,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.12556946277618408,
      "kl": 0.0006933212280273438,
      "learning_rate": 6.649505910711058e-08,
      "loss": 0.0903,
      "num_tokens": 68817835.0,
      "reward": 0.4270833432674408,
      "reward_std": 0.29766905307769775,
      "rewards/format_reward/mean": 0.4270833432674408,
      "rewards/format_reward/std": 0.4972511827945709,
      "step": 239
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.41666666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 2710.0,
      "completions/mean_length": 2733.510498046875,
      "completions/mean_terminated_length": 1182.61767578125,
      "completions/min_length": 365.0,
      "completions/min_terminated_length": 365.0,
      "epoch": 0.5485714285714286,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 0.07574421167373657,
      "kl": 0.0006259282430013021,
      "learning_rate": 6.619104492241847e-08,
      "loss": 0.0495,
      "num_tokens": 69096842.0,
      "reward": 0.3541666865348816,
      "reward_std": 0.10074018687009811,
      "rewards/format_reward/mean": 0.3541666567325592,
      "rewards/format_reward/std": 0.48077061772346497,
      "step": 240
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.16666666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3507.0,
      "completions/mean_length": 3001.28125,
      "completions/mean_terminated_length": 1586.107177734375,
      "completions/min_length": 482.0,
      "completions/min_terminated_length": 482.0,
      "epoch": 0.5508571428571428,
      "frac_reward_zero_std": 0.3125,
      "grad_norm": 0.13523098826408386,
      "kl": 0.0006052652994791666,
      "learning_rate": 6.588648530198504e-08,
      "loss": 0.0916,
      "num_tokens": 69401945.0,
      "reward": 0.3333333432674408,
      "reward_std": 0.2961471974849701,
      "rewards/format_reward/mean": 0.3333333432674408,
      "rewards/format_reward/std": 0.4738790988922119,
      "step": 241
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5416666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3518.0,
      "completions/mean_length": 2763.11474609375,
      "completions/mean_terminated_length": 1454.1351318359375,
      "completions/min_length": 664.0,
      "completions/min_terminated_length": 664.0,
      "epoch": 0.5531428571428572,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.12402091920375824,
      "kl": 0.000682830810546875,
      "learning_rate": 6.558139508961654e-08,
      "loss": 0.0864,
      "num_tokens": 69682702.0,
      "reward": 0.4166666865348816,
      "reward_std": 0.24859580397605896,
      "rewards/format_reward/mean": 0.4166666567325592,
      "rewards/format_reward/std": 0.4955946207046509,
      "step": 242
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.45833333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 2449.0,
      "completions/mean_length": 2568.71875,
      "completions/mean_terminated_length": 799.2285766601562,
      "completions/min_length": 379.0,
      "completions/min_terminated_length": 379.0,
      "epoch": 0.5554285714285714,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.09158474206924438,
      "kl": 0.0006755193074544271,
      "learning_rate": 6.527578915497951e-08,
      "loss": 0.0157,
      "num_tokens": 69945055.0,
      "reward": 0.3958333432674408,
      "reward_std": 0.1343369483947754,
      "rewards/format_reward/mean": 0.3958333432674408,
      "rewards/format_reward/std": 0.4915960133075714,
      "step": 243
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.1666666666666665,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3262.0,
      "completions/mean_length": 2285.42724609375,
      "completions/mean_terminated_length": 1186.6346435546875,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 384.0,
      "epoch": 0.5577142857142857,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.09291835129261017,
      "kl": 0.0005892117818196615,
      "learning_rate": 6.496968239287604e-08,
      "loss": 0.028,
      "num_tokens": 70179798.0,
      "reward": 0.5729166865348816,
      "reward_std": 0.14501188695430756,
      "rewards/format_reward/mean": 0.5729166865348816,
      "rewards/format_reward/std": 0.4972511827945709,
      "step": 244
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.41666666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3365.0,
      "completions/mean_length": 2913.15625,
      "completions/mean_terminated_length": 1689.8529052734375,
      "completions/min_length": 534.0,
      "completions/min_terminated_length": 534.0,
      "epoch": 0.56,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.0874626636505127,
      "kl": 0.0006361007690429688,
      "learning_rate": 6.466308972251785e-08,
      "loss": 0.0119,
      "num_tokens": 70476069.0,
      "reward": 0.3645833432674408,
      "reward_std": 0.15461497008800507,
      "rewards/format_reward/mean": 0.3645833432674408,
      "rewards/format_reward/std": 0.4838397204875946,
      "step": 245
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.41666666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3309.0,
      "completions/mean_length": 2953.375,
      "completions/mean_terminated_length": 1803.4117431640625,
      "completions/min_length": 432.0,
      "completions/min_terminated_length": 432.0,
      "epoch": 0.5622857142857143,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1325761079788208,
      "kl": 0.0006734530131022135,
      "learning_rate": 6.435602608679917e-08,
      "loss": 0.0452,
      "num_tokens": 70775853.0,
      "reward": 0.3750000596046448,
      "reward_std": 0.23116151988506317,
      "rewards/format_reward/mean": 0.375,
      "rewards/format_reward/std": 0.4866642653942108,
      "step": 246
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.375,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3502.0,
      "completions/mean_length": 2759.03125,
      "completions/mean_terminated_length": 1184.0909423828125,
      "completions/min_length": 287.0,
      "completions/min_terminated_length": 287.0,
      "epoch": 0.5645714285714286,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.10333006083965302,
      "kl": 0.0006189346313476562,
      "learning_rate": 6.40485064515684e-08,
      "loss": 0.036,
      "num_tokens": 71056518.0,
      "reward": 0.375,
      "reward_std": 0.1801304817199707,
      "rewards/format_reward/mean": 0.375,
      "rewards/format_reward/std": 0.4866642653942108,
      "step": 247
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.04166666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3492.0,
      "completions/mean_length": 2932.635498046875,
      "completions/mean_terminated_length": 1082.760009765625,
      "completions/min_length": 268.0,
      "completions/min_terminated_length": 268.0,
      "epoch": 0.5668571428571428,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.08525275439023972,
      "kl": 0.0007171630859375,
      "learning_rate": 6.374054580489874e-08,
      "loss": 0.0231,
      "num_tokens": 71354473.0,
      "reward": 0.2708333432674408,
      "reward_std": 0.11558075994253159,
      "rewards/format_reward/mean": 0.2708333432674408,
      "rewards/format_reward/std": 0.44672295451164246,
      "step": 248
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.625,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3256.0,
      "completions/mean_length": 2645.21875,
      "completions/mean_terminated_length": 1273.15380859375,
      "completions/min_length": 360.0,
      "completions/min_terminated_length": 360.0,
      "epoch": 0.5691428571428572,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.12027940899133682,
      "kl": 0.0006109873453776041,
      "learning_rate": 6.343215915635761e-08,
      "loss": 0.0539,
      "num_tokens": 71624548.0,
      "reward": 0.4270833432674408,
      "reward_std": 0.219600647687912,
      "rewards/format_reward/mean": 0.4270833432674408,
      "rewards/format_reward/std": 0.4972511827945709,
      "step": 249
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5416666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3553.0,
      "completions/mean_length": 2841.36474609375,
      "completions/mean_terminated_length": 1657.1622314453125,
      "completions/min_length": 331.0,
      "completions/min_terminated_length": 331.0,
      "epoch": 0.5714285714285714,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.12890154123306274,
      "kl": 0.0006311734517415365,
      "learning_rate": 6.31233615362752e-08,
      "loss": 0.0543,
      "num_tokens": 71913285.0,
      "reward": 0.4479166865348816,
      "reward_std": 0.28763002157211304,
      "rewards/format_reward/mean": 0.4479166567325592,
      "rewards/format_reward/std": 0.49989035725593567,
      "step": 250
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.20833333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3280.0,
      "completions/mean_length": 2801.885498046875,
      "completions/mean_terminated_length": 994.9310302734375,
      "completions/min_length": 279.0,
      "completions/min_terminated_length": 279.0,
      "epoch": 0.5737142857142857,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.09506624937057495,
      "kl": 0.0006351470947265625,
      "learning_rate": 6.281416799501187e-08,
      "loss": 0.0062,
      "num_tokens": 72198220.0,
      "reward": 0.3541666865348816,
      "reward_std": 0.16661179065704346,
      "rewards/format_reward/mean": 0.3541666567325592,
      "rewards/format_reward/std": 0.4807705879211426,
      "step": 251
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3412.0,
      "completions/mean_length": 2869.21875,
      "completions/mean_terminated_length": 1677.9166259765625,
      "completions/min_length": 471.0,
      "completions/min_terminated_length": 471.0,
      "epoch": 0.576,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.11403991281986237,
      "kl": 0.0006430943806966146,
      "learning_rate": 6.250459360222459e-08,
      "loss": 0.0382,
      "num_tokens": 72489517.0,
      "reward": 0.3854166865348816,
      "reward_std": 0.22440217435359955,
      "rewards/format_reward/mean": 0.3854166567325592,
      "rewards/format_reward/std": 0.4892484247684479,
      "step": 252
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.41666666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3557.0,
      "completions/mean_length": 2824.0625,
      "completions/mean_terminated_length": 1438.2940673828125,
      "completions/min_length": 401.0,
      "completions/min_terminated_length": 401.0,
      "epoch": 0.5782857142857143,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.09145499765872955,
      "kl": 0.0006418228149414062,
      "learning_rate": 6.219465344613258e-08,
      "loss": 0.0355,
      "num_tokens": 72776083.0,
      "reward": 0.4270833432674408,
      "reward_std": 0.1921273171901703,
      "rewards/format_reward/mean": 0.4270833432674408,
      "rewards/format_reward/std": 0.4972511827945709,
      "step": 253
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7916666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3336.0,
      "completions/mean_length": 2542.729248046875,
      "completions/mean_terminated_length": 1259.3023681640625,
      "completions/min_length": 318.0,
      "completions/min_terminated_length": 318.0,
      "epoch": 0.5805714285714285,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.10661627352237701,
      "kl": 0.0005591710408528646,
      "learning_rate": 6.188436263278172e-08,
      "loss": 0.0466,
      "num_tokens": 73036325.0,
      "reward": 0.5,
      "reward_std": 0.2350771129131317,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.5026246905326843,
      "step": 254
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3234.0,
      "completions/mean_length": 3043.23974609375,
      "completions/mean_terminated_length": 1420.9583740234375,
      "completions/min_length": 336.0,
      "completions/min_terminated_length": 336.0,
      "epoch": 0.5828571428571429,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.08879940211772919,
      "kl": 0.0006265640258789062,
      "learning_rate": 6.157373628530852e-08,
      "loss": 0.0172,
      "num_tokens": 73343968.0,
      "reward": 0.2916666865348816,
      "reward_std": 0.16661179065704346,
      "rewards/format_reward/mean": 0.2916666567325592,
      "rewards/format_reward/std": 0.4569156765937805,
      "step": 255
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.33333333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3504.0,
      "completions/mean_length": 2902.73974609375,
      "completions/mean_terminated_length": 1540.21875,
      "completions/min_length": 322.0,
      "completions/min_terminated_length": 322.0,
      "epoch": 0.5851428571428572,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.10758217424154282,
      "kl": 0.0006440480550130209,
      "learning_rate": 6.126278954320294e-08,
      "loss": 0.0365,
      "num_tokens": 73640103.0,
      "reward": 0.3958333432674408,
      "reward_std": 0.1988866627216339,
      "rewards/format_reward/mean": 0.3958333432674408,
      "rewards/format_reward/std": 0.4915960133075714,
      "step": 256
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0416666666666665,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3584.0,
      "completions/mean_length": 2624.229248046875,
      "completions/mean_terminated_length": 1703.632568359375,
      "completions/min_length": 301.0,
      "completions/min_terminated_length": 301.0,
      "epoch": 0.5874285714285714,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.12144730240106583,
      "kl": 0.0005919138590494791,
      "learning_rate": 6.095153756157051e-08,
      "loss": 0.0444,
      "num_tokens": 73907665.0,
      "reward": 0.5312500596046448,
      "reward_std": 0.23311930894851685,
      "rewards/format_reward/mean": 0.53125,
      "rewards/format_reward/std": 0.5016420483589172,
      "step": 257
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.33333333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3429.0,
      "completions/mean_length": 2751.89599609375,
      "completions/mean_terminated_length": 1087.6875,
      "completions/min_length": 311.0,
      "completions/min_terminated_length": 311.0,
      "epoch": 0.5897142857142857,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.09886621683835983,
      "kl": 0.0006812413533528646,
      "learning_rate": 6.063999551039369e-08,
      "loss": 0.0392,
      "num_tokens": 74187315.0,
      "reward": 0.3541666865348816,
      "reward_std": 0.1343369334936142,
      "rewards/format_reward/mean": 0.3541666567325592,
      "rewards/format_reward/std": 0.48077061772346497,
      "step": 258
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.4166666666666665,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3496.0,
      "completions/mean_length": 2210.229248046875,
      "completions/mean_terminated_length": 1310.17236328125,
      "completions/min_length": 293.0,
      "completions/min_terminated_length": 293.0,
      "epoch": 0.592,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.11603335291147232,
      "kl": 0.0007600784301757812,
      "learning_rate": 6.032817857379255e-08,
      "loss": 0.0531,
      "num_tokens": 74414479.0,
      "reward": 0.625,
      "reward_std": 0.19408512115478516,
      "rewards/format_reward/mean": 0.625,
      "rewards/format_reward/std": 0.4866642653942108,
      "step": 259
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.875,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3373.0,
      "completions/mean_length": 2494.25,
      "completions/mean_terminated_length": 1259.2000732421875,
      "completions/min_length": 309.0,
      "completions/min_terminated_length": 309.0,
      "epoch": 0.5942857142857143,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.11546017974615097,
      "kl": 0.0006551742553710938,
      "learning_rate": 6.001610194928464e-08,
      "loss": 0.0249,
      "num_tokens": 74670565.0,
      "reward": 0.5208333730697632,
      "reward_std": 0.2076037973165512,
      "rewards/format_reward/mean": 0.5208333134651184,
      "rewards/format_reward/std": 0.5021882057189941,
      "step": 260
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3528.0,
      "completions/mean_length": 3008.58349609375,
      "completions/mean_terminated_length": 2049.5556640625,
      "completions/min_length": 463.0,
      "completions/min_terminated_length": 463.0,
      "epoch": 0.5965714285714285,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.11220080405473709,
      "kl": 0.0006383260091145834,
      "learning_rate": 5.970378084704441e-08,
      "loss": 0.0344,
      "num_tokens": 74975733.0,
      "reward": 0.3854166865348816,
      "reward_std": 0.21436314284801483,
      "rewards/format_reward/mean": 0.3854166567325592,
      "rewards/format_reward/std": 0.4892484247684479,
      "step": 261
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.9583333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3460.0,
      "completions/mean_length": 2530.25,
      "completions/mean_terminated_length": 1431.6595458984375,
      "completions/min_length": 342.0,
      "completions/min_terminated_length": 342.0,
      "epoch": 0.5988571428571429,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.12949800491333008,
      "kl": 0.0006148020426432291,
      "learning_rate": 5.9391230489161725e-08,
      "loss": -0.0257,
      "num_tokens": 75233955.0,
      "reward": 0.53125,
      "reward_std": 0.2379208654165268,
      "rewards/format_reward/mean": 0.53125,
      "rewards/format_reward/std": 0.5016420483589172,
      "step": 262
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7083333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3579.0,
      "completions/mean_length": 2763.8125,
      "completions/mean_terminated_length": 1663.5609130859375,
      "completions/min_length": 508.0,
      "completions/min_terminated_length": 508.0,
      "epoch": 0.6011428571428571,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.09129273146390915,
      "kl": 0.0005540847778320312,
      "learning_rate": 5.907846610890012e-08,
      "loss": 0.0452,
      "num_tokens": 75515427.0,
      "reward": 0.4479166865348816,
      "reward_std": 0.16856959462165833,
      "rewards/format_reward/mean": 0.4479166567325592,
      "rewards/format_reward/std": 0.49989035725593567,
      "step": 263
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.45833333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3337.0,
      "completions/mean_length": 2752.916748046875,
      "completions/mean_terminated_length": 1304.4571533203125,
      "completions/min_length": 247.0,
      "completions/min_terminated_length": 247.0,
      "epoch": 0.6034285714285714,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.14278699457645416,
      "kl": 0.0006055831909179688,
      "learning_rate": 5.8765502949954205e-08,
      "loss": 0.0164,
      "num_tokens": 75795661.0,
      "reward": 0.40625,
      "reward_std": 0.2928674817085266,
      "rewards/format_reward/mean": 0.40625,
      "rewards/format_reward/std": 0.4937104284763336,
      "step": 264
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.9166666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3352.0,
      "completions/mean_length": 2579.5,
      "completions/mean_terminated_length": 1487.6522216796875,
      "completions/min_length": 393.0,
      "completions/min_terminated_length": 393.0,
      "epoch": 0.6057142857142858,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.12508751451969147,
      "kl": 0.0006745656331380209,
      "learning_rate": 5.845235626570684e-08,
      "loss": 0.0209,
      "num_tokens": 76059787.0,
      "reward": 0.5104166865348816,
      "reward_std": 0.2653941810131073,
      "rewards/format_reward/mean": 0.5104166865348816,
      "rewards/format_reward/std": 0.5025156140327454,
      "step": 265
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7083333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3554.0,
      "completions/mean_length": 2894.8125,
      "completions/mean_terminated_length": 1970.2926025390625,
      "completions/min_length": 400.0,
      "completions/min_terminated_length": 400.0,
      "epoch": 0.608,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.11293845623731613,
      "kl": 0.0006990432739257812,
      "learning_rate": 5.813904131848564e-08,
      "loss": 0.05,
      "num_tokens": 76353439.0,
      "reward": 0.4791666865348816,
      "reward_std": 0.2721535563468933,
      "rewards/format_reward/mean": 0.4791666567325592,
      "rewards/format_reward/std": 0.5021882057189941,
      "step": 266
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3391.0,
      "completions/mean_length": 2822.21875,
      "completions/mean_terminated_length": 1552.5833740234375,
      "completions/min_length": 429.0,
      "completions/min_terminated_length": 429.0,
      "epoch": 0.6102857142857143,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.09845055639743805,
      "kl": 0.000606536865234375,
      "learning_rate": 5.7825573378819105e-08,
      "loss": 0.0232,
      "num_tokens": 76640764.0,
      "reward": 0.40625,
      "reward_std": 0.17728674411773682,
      "rewards/format_reward/mean": 0.40625,
      "rewards/format_reward/std": 0.4937104284763336,
      "step": 267
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.6666666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3550.0,
      "completions/mean_length": 2686.6875,
      "completions/mean_terminated_length": 1430.4500732421875,
      "completions/min_length": 259.0,
      "completions/min_terminated_length": 259.0,
      "epoch": 0.6125714285714285,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.10997117310762405,
      "kl": 0.0006361007690429688,
      "learning_rate": 5.7511967724692364e-08,
      "loss": 0.0231,
      "num_tokens": 76914838.0,
      "reward": 0.4479166865348816,
      "reward_std": 0.23311933875083923,
      "rewards/format_reward/mean": 0.4479166567325592,
      "rewards/format_reward/std": 0.49989035725593567,
      "step": 268
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5416666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3470.0,
      "completions/mean_length": 2866.010498046875,
      "completions/mean_terminated_length": 1721.108154296875,
      "completions/min_length": 405.0,
      "completions/min_terminated_length": 405.0,
      "epoch": 0.6148571428571429,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.09718002378940582,
      "kl": 0.0006148020426432291,
      "learning_rate": 5.719823964080261e-08,
      "loss": 0.0337,
      "num_tokens": 77206871.0,
      "reward": 0.40625,
      "reward_std": 0.17728674411773682,
      "rewards/format_reward/mean": 0.40625,
      "rewards/format_reward/std": 0.4937104284763336,
      "step": 269
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.45833333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3485.0,
      "completions/mean_length": 2799.197998046875,
      "completions/mean_terminated_length": 1431.4000244140625,
      "completions/min_length": 413.0,
      "completions/min_terminated_length": 413.0,
      "epoch": 0.6171428571428571,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.15031415224075317,
      "kl": 0.0007104873657226562,
      "learning_rate": 5.688440441781398e-08,
      "loss": 0.065,
      "num_tokens": 77490990.0,
      "reward": 0.4166666865348816,
      "reward_std": 0.3506578803062439,
      "rewards/format_reward/mean": 0.4166666567325592,
      "rewards/format_reward/std": 0.4955946207046509,
      "step": 270
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7083333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3561.0,
      "completions/mean_length": 2725.28125,
      "completions/mean_terminated_length": 1573.3414306640625,
      "completions/min_length": 531.0,
      "completions/min_terminated_length": 531.0,
      "epoch": 0.6194285714285714,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.10987356305122375,
      "kl": 0.0006017684936523438,
      "learning_rate": 5.6570477351612554e-08,
      "loss": 0.0556,
      "num_tokens": 77768895.0,
      "reward": 0.4791666865348816,
      "reward_std": 0.22635996341705322,
      "rewards/format_reward/mean": 0.4791666567325592,
      "rewards/format_reward/std": 0.5021882057189941,
      "step": 271
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.625,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3483.0,
      "completions/mean_length": 2799.166748046875,
      "completions/mean_terminated_length": 1652.1025390625,
      "completions/min_length": 359.0,
      "completions/min_terminated_length": 359.0,
      "epoch": 0.6217142857142857,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.12976664304733276,
      "kl": 0.0006278355916341146,
      "learning_rate": 5.6256473742560605e-08,
      "loss": 0.0381,
      "num_tokens": 78053101.0,
      "reward": 0.46875,
      "reward_std": 0.30638620257377625,
      "rewards/format_reward/mean": 0.46875,
      "rewards/format_reward/std": 0.5016420483589172,
      "step": 272
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.2083333333333335,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3533.0,
      "completions/mean_length": 2288.42724609375,
      "completions/mean_terminated_length": 1237.3018798828125,
      "completions/min_length": 286.0,
      "completions/min_terminated_length": 286.0,
      "epoch": 0.624,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1335023194551468,
      "kl": 0.0007028579711914062,
      "learning_rate": 5.594240889475106e-08,
      "loss": 0.0614,
      "num_tokens": 78288264.0,
      "reward": 0.5729166865348816,
      "reward_std": 0.2640722990036011,
      "rewards/format_reward/mean": 0.5729166865348816,
      "rewards/format_reward/std": 0.4972512423992157,
      "step": 273
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.8333333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3493.0,
      "completions/mean_length": 2633.885498046875,
      "completions/mean_terminated_length": 1511.0228271484375,
      "completions/min_length": 359.0,
      "completions/min_terminated_length": 359.0,
      "epoch": 0.6262857142857143,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.13292840123176575,
      "kl": 0.0007330576578776041,
      "learning_rate": 5.562829811526154e-08,
      "loss": 0.0492,
      "num_tokens": 78556615.0,
      "reward": 0.5208333730697632,
      "reward_std": 0.24468021094799042,
      "rewards/format_reward/mean": 0.5208333134651184,
      "rewards/format_reward/std": 0.5021882057189941,
      "step": 274
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 2875.0,
      "completions/mean_length": 2690.84375,
      "completions/mean_terminated_length": 1202.25,
      "completions/min_length": 443.0,
      "completions/min_terminated_length": 443.0,
      "epoch": 0.6285714285714286,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.11393062770366669,
      "kl": 0.0006831487019856771,
      "learning_rate": 5.5314156713408264e-08,
      "loss": 0.0937,
      "num_tokens": 78830620.0,
      "reward": 0.40625,
      "reward_std": 0.24183647334575653,
      "rewards/format_reward/mean": 0.40625,
      "rewards/format_reward/std": 0.4937104284763336,
      "step": 275
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3519.0,
      "completions/mean_length": 2429.05224609375,
      "completions/mean_terminated_length": 1274.104248046875,
      "completions/min_length": 340.0,
      "completions/min_terminated_length": 340.0,
      "epoch": 0.6308571428571429,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.11953698098659515,
      "kl": 0.0007349650065104166,
      "learning_rate": 5.5e-08,
      "loss": 0.049,
      "num_tokens": 79079043.0,
      "reward": 0.5520833730697632,
      "reward_std": 0.20956158638000488,
      "rewards/format_reward/mean": 0.5520833134651184,
      "rewards/format_reward/std": 0.4998903274536133,
      "step": 276
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7916666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3483.0,
      "completions/mean_length": 2592.02099609375,
      "completions/mean_terminated_length": 1369.348876953125,
      "completions/min_length": 432.0,
      "completions/min_terminated_length": 432.0,
      "epoch": 0.6331428571428571,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.09889426827430725,
      "kl": 0.0006306966145833334,
      "learning_rate": 5.468584328659173e-08,
      "loss": -0.0167,
      "num_tokens": 79344689.0,
      "reward": 0.4895833432674408,
      "reward_std": 0.12234010547399521,
      "rewards/format_reward/mean": 0.4895833432674408,
      "rewards/format_reward/std": 0.5025155544281006,
      "step": 277
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.75,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3580.0,
      "completions/mean_length": 2784.625,
      "completions/mean_terminated_length": 1756.857177734375,
      "completions/min_length": 386.0,
      "completions/min_terminated_length": 386.0,
      "epoch": 0.6354285714285715,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.13919898867607117,
      "kl": 0.00070953369140625,
      "learning_rate": 5.4371701884738466e-08,
      "loss": 0.0669,
      "num_tokens": 79628219.0,
      "reward": 0.5104166865348816,
      "reward_std": 0.30638617277145386,
      "rewards/format_reward/mean": 0.5104166865348816,
      "rewards/format_reward/std": 0.5025155544281006,
      "step": 278
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.16666666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3101.0,
      "completions/mean_length": 2894.71875,
      "completions/mean_terminated_length": 1220.75,
      "completions/min_length": 398.0,
      "completions/min_terminated_length": 398.0,
      "epoch": 0.6377142857142857,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.08552590757608414,
      "kl": 0.0006707509358723959,
      "learning_rate": 5.405759110524893e-08,
      "loss": 0.0469,
      "num_tokens": 79921580.0,
      "reward": 0.3229166865348816,
      "reward_std": 0.14981341361999512,
      "rewards/format_reward/mean": 0.3229166567325592,
      "rewards/format_reward/std": 0.4700457453727722,
      "step": 279
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7916666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3127.0,
      "completions/mean_length": 2573.40625,
      "completions/mean_terminated_length": 1327.7906494140625,
      "completions/min_length": 500.0,
      "completions/min_terminated_length": 500.0,
      "epoch": 0.64,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1325606405735016,
      "kl": 0.0006453196207682291,
      "learning_rate": 5.374352625743941e-08,
      "loss": 0.0791,
      "num_tokens": 80183969.0,
      "reward": 0.46875,
      "reward_std": 0.23311930894851685,
      "rewards/format_reward/mean": 0.46875,
      "rewards/format_reward/std": 0.5016420483589172,
      "step": 280
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7916666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3517.0,
      "completions/mean_length": 2715.625,
      "completions/mean_terminated_length": 1645.3023681640625,
      "completions/min_length": 345.0,
      "completions/min_terminated_length": 345.0,
      "epoch": 0.6422857142857142,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.12638314068317413,
      "kl": 0.0006529490152994791,
      "learning_rate": 5.342952264838747e-08,
      "loss": 0.0544,
      "num_tokens": 80461163.0,
      "reward": 0.4895833432674408,
      "reward_std": 0.26059263944625854,
      "rewards/format_reward/mean": 0.4895833432674408,
      "rewards/format_reward/std": 0.5025155544281006,
      "step": 281
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7083333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3418.0,
      "completions/mean_length": 2836.510498046875,
      "completions/mean_terminated_length": 1833.7803955078125,
      "completions/min_length": 798.0,
      "completions/min_terminated_length": 798.0,
      "epoch": 0.6445714285714286,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.13566777110099792,
      "kl": 0.0007076263427734375,
      "learning_rate": 5.3115595582186024e-08,
      "loss": 0.0592,
      "num_tokens": 80749686.0,
      "reward": 0.4583333432674408,
      "reward_std": 0.28219255805015564,
      "rewards/format_reward/mean": 0.4583333432674408,
      "rewards/format_reward/std": 0.5008764266967773,
      "step": 282
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7083333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3335.0,
      "completions/mean_length": 2805.625,
      "completions/mean_terminated_length": 1761.46337890625,
      "completions/min_length": 516.0,
      "completions/min_terminated_length": 516.0,
      "epoch": 0.6468571428571429,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.11968239396810532,
      "kl": 0.0007022221883138021,
      "learning_rate": 5.28017603591974e-08,
      "loss": 0.0728,
      "num_tokens": 81034146.0,
      "reward": 0.4583333432674408,
      "reward_std": 0.29962682723999023,
      "rewards/format_reward/mean": 0.4583333432674408,
      "rewards/format_reward/std": 0.5008764266967773,
      "step": 283
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.08333333333333337,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3411.0,
      "completions/mean_length": 3168.6875,
      "completions/mean_terminated_length": 1771.727294921875,
      "completions/min_length": 586.0,
      "completions/min_terminated_length": 586.0,
      "epoch": 0.6491428571428571,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.06619197875261307,
      "kl": 0.0005556742350260416,
      "learning_rate": 5.248803227530763e-08,
      "loss": 0.0483,
      "num_tokens": 81355140.0,
      "reward": 0.25,
      "reward_std": 0.15657275915145874,
      "rewards/format_reward/mean": 0.25,
      "rewards/format_reward/std": 0.435285747051239,
      "step": 284
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.875,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3318.0,
      "completions/mean_length": 2547.0625,
      "completions/mean_terminated_length": 1371.86669921875,
      "completions/min_length": 409.0,
      "completions/min_terminated_length": 409.0,
      "epoch": 0.6514285714285715,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.11932481825351715,
      "kl": 0.0006405512491861979,
      "learning_rate": 5.21744266211809e-08,
      "loss": 0.0696,
      "num_tokens": 81615654.0,
      "reward": 0.4791666865348816,
      "reward_std": 0.24468021094799042,
      "rewards/format_reward/mean": 0.4791666567325592,
      "rewards/format_reward/std": 0.5021882057189941,
      "step": 285
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.20833333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3562.0,
      "completions/mean_length": 2983.0625,
      "completions/mean_terminated_length": 1594.689697265625,
      "completions/min_length": 327.0,
      "completions/min_terminated_length": 327.0,
      "epoch": 0.6537142857142857,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.11437805742025375,
      "kl": 0.0006144841512044271,
      "learning_rate": 5.1860958681514355e-08,
      "loss": -0.0055,
      "num_tokens": 81917802.0,
      "reward": 0.3229166865348816,
      "reward_std": 0.1921273171901703,
      "rewards/format_reward/mean": 0.3229166567325592,
      "rewards/format_reward/std": 0.4700457453727722,
      "step": 286
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.625,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3505.0,
      "completions/mean_length": 2701.80224609375,
      "completions/mean_terminated_length": 1412.4359130859375,
      "completions/min_length": 488.0,
      "completions/min_terminated_length": 488.0,
      "epoch": 0.656,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.10379407554864883,
      "kl": 0.000617822011311849,
      "learning_rate": 5.1547643734293155e-08,
      "loss": 0.0689,
      "num_tokens": 82193189.0,
      "reward": 0.4270833432674408,
      "reward_std": 0.17337113618850708,
      "rewards/format_reward/mean": 0.4270833432674408,
      "rewards/format_reward/std": 0.4972511827945709,
      "step": 287
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3017.0,
      "completions/mean_length": 2696.09375,
      "completions/mean_terminated_length": 1216.25,
      "completions/min_length": 332.0,
      "completions/min_terminated_length": 332.0,
      "epoch": 0.6582857142857143,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.11394447088241577,
      "kl": 0.0006869633992513021,
      "learning_rate": 5.1234497050045814e-08,
      "loss": 0.0097,
      "num_tokens": 82468148.0,
      "reward": 0.3958333432674408,
      "reward_std": 0.1530931293964386,
      "rewards/format_reward/mean": 0.3958333432674408,
      "rewards/format_reward/std": 0.4915960133075714,
      "step": 288
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.25,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3506.0,
      "completions/mean_length": 2272.510498046875,
      "completions/mean_terminated_length": 1252.4630126953125,
      "completions/min_length": 284.0,
      "completions/min_terminated_length": 284.0,
      "epoch": 0.6605714285714286,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.16180695593357086,
      "kl": 0.0006192525227864584,
      "learning_rate": 5.09215338910999e-08,
      "loss": 0.0779,
      "num_tokens": 82701429.0,
      "reward": 0.5833333730697632,
      "reward_std": 0.3044283986091614,
      "rewards/format_reward/mean": 0.5833333134651184,
      "rewards/format_reward/std": 0.4955945909023285,
      "step": 289
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.625,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3547.0,
      "completions/mean_length": 2717.0625,
      "completions/mean_terminated_length": 1450.0,
      "completions/min_length": 458.0,
      "completions/min_terminated_length": 458.0,
      "epoch": 0.6628571428571428,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.08857004344463348,
      "kl": 0.0007041295369466146,
      "learning_rate": 5.060876951083828e-08,
      "loss": 0.0593,
      "num_tokens": 82978797.0,
      "reward": 0.4479166865348816,
      "reward_std": 0.21916469931602478,
      "rewards/format_reward/mean": 0.4479166567325592,
      "rewards/format_reward/std": 0.49989035725593567,
      "step": 290
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.20833333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3273.0,
      "completions/mean_length": 2966.260498046875,
      "completions/mean_terminated_length": 1539.0689697265625,
      "completions/min_length": 381.0,
      "completions/min_terminated_length": 381.0,
      "epoch": 0.6651428571428571,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 0.06730318069458008,
      "kl": 0.0006262461344401041,
      "learning_rate": 5.02962191529556e-08,
      "loss": 0.0011,
      "num_tokens": 83279800.0,
      "reward": 0.34375,
      "reward_std": 0.0852636992931366,
      "rewards/format_reward/mean": 0.34375,
      "rewards/format_reward/std": 0.4774521291255951,
      "step": 291
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.9583333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3522.0,
      "completions/mean_length": 2521.010498046875,
      "completions/mean_terminated_length": 1412.7872314453125,
      "completions/min_length": 430.0,
      "completions/min_terminated_length": 430.0,
      "epoch": 0.6674285714285715,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.10721820592880249,
      "kl": 0.0006542205810546875,
      "learning_rate": 4.9983898050715357e-08,
      "loss": 0.0605,
      "num_tokens": 83538101.0,
      "reward": 0.53125,
      "reward_std": 0.2653941810131073,
      "rewards/format_reward/mean": 0.53125,
      "rewards/format_reward/std": 0.5016420483589172,
      "step": 292
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.125,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3549.0,
      "completions/mean_length": 2342.34375,
      "completions/mean_terminated_length": 1246.7647705078125,
      "completions/min_length": 353.0,
      "completions/min_terminated_length": 353.0,
      "epoch": 0.6697142857142857,
      "frac_reward_zero_std": 0.3125,
      "grad_norm": 0.16255320608615875,
      "kl": 0.0007025400797526041,
      "learning_rate": 4.9671821426207447e-08,
      "loss": 0.0837,
      "num_tokens": 83779070.0,
      "reward": 0.53125,
      "reward_std": 0.3164252042770386,
      "rewards/format_reward/mean": 0.53125,
      "rewards/format_reward/std": 0.5016420483589172,
      "step": 293
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.25,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3576.0,
      "completions/mean_length": 3046.42724609375,
      "completions/mean_terminated_length": 1863.7667236328125,
      "completions/min_length": 504.0,
      "completions/min_terminated_length": 504.0,
      "epoch": 0.672,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.09268605709075928,
      "kl": 0.0006469090779622396,
      "learning_rate": 4.93600044896063e-08,
      "loss": 0.0276,
      "num_tokens": 84088195.0,
      "reward": 0.34375,
      "reward_std": 0.18208830058574677,
      "rewards/format_reward/mean": 0.34375,
      "rewards/format_reward/std": 0.4774521291255951,
      "step": 294
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.75,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3484.0,
      "completions/mean_length": 2709.229248046875,
      "completions/mean_terminated_length": 1584.5238037109375,
      "completions/min_length": 551.0,
      "completions/min_terminated_length": 551.0,
      "epoch": 0.6742857142857143,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.08126060664653778,
      "kl": 0.0006958643595377604,
      "learning_rate": 4.9048462438429486e-08,
      "loss": 0.0407,
      "num_tokens": 84364775.0,
      "reward": 0.4583333432674408,
      "reward_std": 0.11558075994253159,
      "rewards/format_reward/mean": 0.4583333432674408,
      "rewards/format_reward/std": 0.5008764266967773,
      "step": 295
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.125,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3373.0,
      "completions/mean_length": 2988.854248046875,
      "completions/mean_terminated_length": 1467.9259033203125,
      "completions/min_length": 495.0,
      "completions/min_terminated_length": 495.0,
      "epoch": 0.6765714285714286,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.14968274533748627,
      "kl": 0.0007352828979492188,
      "learning_rate": 4.873721045679706e-08,
      "loss": 0.043,
      "num_tokens": 84667329.0,
      "reward": 0.3229166865348816,
      "reward_std": 0.21827873587608337,
      "rewards/format_reward/mean": 0.3229166567325592,
      "rewards/format_reward/std": 0.4700457453727722,
      "step": 296
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.41666666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3432.0,
      "completions/mean_length": 2834.760498046875,
      "completions/mean_terminated_length": 1468.5,
      "completions/min_length": 478.0,
      "completions/min_terminated_length": 478.0,
      "epoch": 0.6788571428571428,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.11504609137773514,
      "kl": 0.0006415049235026041,
      "learning_rate": 4.842626371469148e-08,
      "loss": 0.0437,
      "num_tokens": 84956068.0,
      "reward": 0.4270833432674408,
      "reward_std": 0.20084445178508759,
      "rewards/format_reward/mean": 0.4270833432674408,
      "rewards/format_reward/std": 0.4972511827945709,
      "step": 297
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.625,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3535.0,
      "completions/mean_length": 2677.36474609375,
      "completions/mean_terminated_length": 1352.2821044921875,
      "completions/min_length": 482.0,
      "completions/min_terminated_length": 482.0,
      "epoch": 0.6811428571428572,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.14034387469291687,
      "kl": 0.0006526311238606771,
      "learning_rate": 4.811563736721829e-08,
      "loss": 0.0091,
      "num_tokens": 85229037.0,
      "reward": 0.4375,
      "reward_std": 0.2773910164833069,
      "rewards/format_reward/mean": 0.4375,
      "rewards/format_reward/std": 0.4986824691295624,
      "step": 298
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.125,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3537.0,
      "completions/mean_length": 2894.11474609375,
      "completions/mean_terminated_length": 1131.0740966796875,
      "completions/min_length": 435.0,
      "completions/min_terminated_length": 435.0,
      "epoch": 0.6834285714285714,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.06885220855474472,
      "kl": 0.0006777445475260416,
      "learning_rate": 4.7805346553867434e-08,
      "loss": 0.0079,
      "num_tokens": 85523816.0,
      "reward": 0.2916666567325592,
      "reward_std": 0.10206207633018494,
      "rewards/format_reward/mean": 0.2916666567325592,
      "rewards/format_reward/std": 0.4569156765937805,
      "step": 299
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.625,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3319.0,
      "completions/mean_length": 2698.15625,
      "completions/mean_terminated_length": 1403.4615478515625,
      "completions/min_length": 392.0,
      "completions/min_terminated_length": 392.0,
      "epoch": 0.6857142857142857,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.12070653587579727,
      "kl": 0.0006907780965169271,
      "learning_rate": 4.7495406397775394e-08,
      "loss": 0.0504,
      "num_tokens": 85798883.0,
      "reward": 0.4270833432674408,
      "reward_std": 0.16856960952281952,
      "rewards/format_reward/mean": 0.4270833432674408,
      "rewards/format_reward/std": 0.4972511827945709,
      "step": 300
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3530.0,
      "completions/mean_length": 3193.9375,
      "completions/mean_terminated_length": 1800.857177734375,
      "completions/min_length": 357.0,
      "completions/min_terminated_length": 357.0,
      "epoch": 0.688,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.10006493330001831,
      "kl": 0.0006546974182128906,
      "learning_rate": 4.718583200498813e-08,
      "loss": 0.0494,
      "num_tokens": 86121749.0,
      "reward": 0.2708333432674408,
      "reward_std": 0.2856721878051758,
      "rewards/format_reward/mean": 0.2708333432674408,
      "rewards/format_reward/std": 0.44672295451164246,
      "step": 301
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.125,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 2942.0,
      "completions/mean_length": 2248.5625,
      "completions/mean_terminated_length": 1070.2353515625,
      "completions/min_length": 271.0,
      "completions/min_terminated_length": 271.0,
      "epoch": 0.6902857142857143,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.1160050481557846,
      "kl": 0.0007260640462239584,
      "learning_rate": 4.6876638463724804e-08,
      "loss": 0.0177,
      "num_tokens": 86353043.0,
      "reward": 0.6354166865348816,
      "reward_std": 0.21436314284801483,
      "rewards/format_reward/mean": 0.6354166865348816,
      "rewards/format_reward/std": 0.4838397204875946,
      "step": 302
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.375,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3340.0,
      "completions/mean_length": 2885.58349609375,
      "completions/mean_terminated_length": 1552.242431640625,
      "completions/min_length": 481.0,
      "completions/min_terminated_length": 481.0,
      "epoch": 0.6925714285714286,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.10667046159505844,
      "kl": 0.0007079442342122396,
      "learning_rate": 4.656784084364238e-08,
      "loss": 0.0059,
      "num_tokens": 86646205.0,
      "reward": 0.3645833432674408,
      "reward_std": 0.17337113618850708,
      "rewards/format_reward/mean": 0.3645833432674408,
      "rewards/format_reward/std": 0.4838397204875946,
      "step": 303
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.375,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3581.0,
      "completions/mean_length": 2266.83349609375,
      "completions/mean_terminated_length": 1365.614013671875,
      "completions/min_length": 253.0,
      "completions/min_terminated_length": 253.0,
      "epoch": 0.6948571428571428,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.062273427844047546,
      "kl": 0.0006469090779622396,
      "learning_rate": 4.6259454195101264e-08,
      "loss": 0.0107,
      "num_tokens": 86879223.0,
      "reward": 0.6145833730697632,
      "reward_std": 0.05779037997126579,
      "rewards/format_reward/mean": 0.6145833134651184,
      "rewards/format_reward/std": 0.4892484247684479,
      "step": 304
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.08333333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3108.0,
      "completions/mean_length": 2927.39599609375,
      "completions/mean_terminated_length": 1159.615478515625,
      "completions/min_length": 379.0,
      "completions/min_terminated_length": 379.0,
      "epoch": 0.6971428571428572,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.08911117911338806,
      "kl": 0.0006799697875976562,
      "learning_rate": 4.59514935484316e-08,
      "loss": 0.0798,
      "num_tokens": 87177995.0,
      "reward": 0.28125,
      "reward_std": 0.1921273171901703,
      "rewards/format_reward/mean": 0.28125,
      "rewards/format_reward/std": 0.45196935534477234,
      "step": 305
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.33333333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3402.0,
      "completions/mean_length": 2868.885498046875,
      "completions/mean_terminated_length": 1438.65625,
      "completions/min_length": 390.0,
      "completions/min_terminated_length": 390.0,
      "epoch": 0.6994285714285714,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.09344348311424255,
      "kl": 0.000652313232421875,
      "learning_rate": 4.564397391320084e-08,
      "loss": 0.0681,
      "num_tokens": 87469650.0,
      "reward": 0.34375,
      "reward_std": 0.20084445178508759,
      "rewards/format_reward/mean": 0.34375,
      "rewards/format_reward/std": 0.4774521291255951,
      "step": 306
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.04166666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3469.0,
      "completions/mean_length": 3219.0,
      "completions/mean_terminated_length": 2182.39990234375,
      "completions/min_length": 643.0,
      "completions/min_terminated_length": 643.0,
      "epoch": 0.7017142857142857,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.11024210602045059,
      "kl": 0.0006062189737955729,
      "learning_rate": 4.533691027748215e-08,
      "loss": 0.0365,
      "num_tokens": 87795084.0,
      "reward": 0.2916666865348816,
      "reward_std": 0.2957112491130829,
      "rewards/format_reward/mean": 0.2916666567325592,
      "rewards/format_reward/std": 0.4569156765937805,
      "step": 307
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5833333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3483.0,
      "completions/mean_length": 2657.55224609375,
      "completions/mean_terminated_length": 1243.5,
      "completions/min_length": 320.0,
      "completions/min_terminated_length": 320.0,
      "epoch": 0.704,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.145328551530838,
      "kl": 0.0007899602254231771,
      "learning_rate": 4.5030317607123966e-08,
      "loss": 0.0615,
      "num_tokens": 88065407.0,
      "reward": 0.4270833432674408,
      "reward_std": 0.2841503620147705,
      "rewards/format_reward/mean": 0.4270833432674408,
      "rewards/format_reward/std": 0.4972511827945709,
      "step": 308
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7916666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3569.0,
      "completions/mean_length": 2607.572998046875,
      "completions/mean_terminated_length": 1404.0697021484375,
      "completions/min_length": 357.0,
      "completions/min_terminated_length": 357.0,
      "epoch": 0.7062857142857143,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.11172831803560257,
      "kl": 0.0006608963012695312,
      "learning_rate": 4.4724210845020494e-08,
      "loss": 0.0783,
      "num_tokens": 88331868.0,
      "reward": 0.4583333432674408,
      "reward_std": 0.22112248837947845,
      "rewards/format_reward/mean": 0.4583333432674408,
      "rewards/format_reward/std": 0.5008764266967773,
      "step": 309
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7083333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3572.0,
      "completions/mean_length": 2582.479248046875,
      "completions/mean_terminated_length": 1238.9755859375,
      "completions/min_length": 344.0,
      "completions/min_terminated_length": 344.0,
      "epoch": 0.7085714285714285,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.12015148997306824,
      "kl": 0.0005992253621419271,
      "learning_rate": 4.441860491038345e-08,
      "loss": 0.032,
      "num_tokens": 88595230.0,
      "reward": 0.4895833432674408,
      "reward_std": 0.2653941810131073,
      "rewards/format_reward/mean": 0.4895833432674408,
      "rewards/format_reward/std": 0.5025155544281006,
      "step": 310
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.29166666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3057.0,
      "completions/mean_length": 2851.479248046875,
      "completions/mean_terminated_length": 1315.54833984375,
      "completions/min_length": 367.0,
      "completions/min_terminated_length": 367.0,
      "epoch": 0.7108571428571429,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.0867365375161171,
      "kl": 0.0006038347880045573,
      "learning_rate": 4.4113514698014955e-08,
      "loss": 0.0452,
      "num_tokens": 88885868.0,
      "reward": 0.34375,
      "reward_std": 0.15985244512557983,
      "rewards/format_reward/mean": 0.34375,
      "rewards/format_reward/std": 0.4774521291255951,
      "step": 311
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.33333333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3521.0,
      "completions/mean_length": 2841.166748046875,
      "completions/mean_terminated_length": 1355.5,
      "completions/min_length": 560.0,
      "completions/min_terminated_length": 560.0,
      "epoch": 0.7131428571428572,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.1219867542386055,
      "kl": 0.0006688435872395834,
      "learning_rate": 4.380895507758154e-08,
      "loss": 0.059,
      "num_tokens": 89174574.0,
      "reward": 0.375,
      "reward_std": 0.28219255805015564,
      "rewards/format_reward/mean": 0.375,
      "rewards/format_reward/std": 0.4866642653942108,
      "step": 312
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7916666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 2859.0,
      "completions/mean_length": 2503.61474609375,
      "completions/mean_terminated_length": 1171.9766845703125,
      "completions/min_length": 231.0,
      "completions/min_terminated_length": 231.0,
      "epoch": 0.7154285714285714,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.11462200433015823,
      "kl": 0.0006494522094726562,
      "learning_rate": 4.350494089288943e-08,
      "loss": 0.0293,
      "num_tokens": 89431583.0,
      "reward": 0.5104167461395264,
      "reward_std": 0.17337113618850708,
      "rewards/format_reward/mean": 0.5104166865348816,
      "rewards/format_reward/std": 0.5025155544281006,
      "step": 313
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7083333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3104.0,
      "completions/mean_length": 2577.260498046875,
      "completions/mean_terminated_length": 1226.756103515625,
      "completions/min_length": 326.0,
      "completions/min_terminated_length": 326.0,
      "epoch": 0.7177142857142857,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.09503225237131119,
      "kl": 0.0007216135660807291,
      "learning_rate": 4.3201486961161095e-08,
      "loss": 0.0493,
      "num_tokens": 89694846.0,
      "reward": 0.46875,
      "reward_std": 0.22440217435359955,
      "rewards/format_reward/mean": 0.46875,
      "rewards/format_reward/std": 0.5016420483589172,
      "step": 314
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.875,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3560.0,
      "completions/mean_length": 2438.635498046875,
      "completions/mean_terminated_length": 1140.5555419921875,
      "completions/min_length": 396.0,
      "completions/min_terminated_length": 396.0,
      "epoch": 0.72,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.12994083762168884,
      "kl": 0.0006510416666666666,
      "learning_rate": 4.289860807231305e-08,
      "loss": 0.0514,
      "num_tokens": 89945743.0,
      "reward": 0.5208333730697632,
      "reward_std": 0.22635996341705322,
      "rewards/format_reward/mean": 0.5208333134651184,
      "rewards/format_reward/std": 0.5021882057189941,
      "step": 315
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7083333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3217.0,
      "completions/mean_length": 2738.52099609375,
      "completions/mean_terminated_length": 1604.3414306640625,
      "completions/min_length": 397.0,
      "completions/min_terminated_length": 397.0,
      "epoch": 0.7222857142857143,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.10149212926626205,
      "kl": 0.0007270177205403646,
      "learning_rate": 4.2596318988235035e-08,
      "loss": 0.0241,
      "num_tokens": 90225123.0,
      "reward": 0.4583333432674408,
      "reward_std": 0.1801304817199707,
      "rewards/format_reward/mean": 0.4583333432674408,
      "rewards/format_reward/std": 0.5008764266967773,
      "step": 316
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.25,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3518.0,
      "completions/mean_length": 2962.86474609375,
      "completions/mean_terminated_length": 1596.36669921875,
      "completions/min_length": 521.0,
      "completions/min_terminated_length": 521.0,
      "epoch": 0.7245714285714285,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.10409843921661377,
      "kl": 0.0006618499755859375,
      "learning_rate": 4.2294634442070557e-08,
      "loss": 0.0227,
      "num_tokens": 90526292.0,
      "reward": 0.3645833432674408,
      "reward_std": 0.20084445178508759,
      "rewards/format_reward/mean": 0.3645833432674408,
      "rewards/format_reward/std": 0.4838397204875946,
      "step": 317
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.75,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3224.0,
      "completions/mean_length": 2703.625,
      "completions/mean_terminated_length": 1571.71435546875,
      "completions/min_length": 522.0,
      "completions/min_terminated_length": 522.0,
      "epoch": 0.7268571428571429,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.11185863614082336,
      "kl": 0.0006682078043619791,
      "learning_rate": 4.1993569137498776e-08,
      "loss": 0.0496,
      "num_tokens": 90802334.0,
      "reward": 0.5,
      "reward_std": 0.25863486528396606,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.5026246905326843,
      "step": 318
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.125,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3489.0,
      "completions/mean_length": 3009.260498046875,
      "completions/mean_terminated_length": 1540.4814453125,
      "completions/min_length": 392.0,
      "completions/min_terminated_length": 392.0,
      "epoch": 0.7291428571428571,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.12368905544281006,
      "kl": 0.0007559458414713541,
      "learning_rate": 4.1693137748017915e-08,
      "loss": 0.0715,
      "num_tokens": 91107879.0,
      "reward": 0.3020833432674408,
      "reward_std": 0.23311930894851685,
      "rewards/format_reward/mean": 0.3020833432674408,
      "rewards/format_reward/std": 0.46157145500183105,
      "step": 319
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.41666666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3557.0,
      "completions/mean_length": 2733.260498046875,
      "completions/mean_terminated_length": 1181.9117431640625,
      "completions/min_length": 395.0,
      "completions/min_terminated_length": 395.0,
      "epoch": 0.7314285714285714,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.1384938657283783,
      "kl": 0.0007680257161458334,
      "learning_rate": 4.1393354916230006e-08,
      "loss": 0.0818,
      "num_tokens": 91386046.0,
      "reward": 0.4375,
      "reward_std": 0.24511614441871643,
      "rewards/format_reward/mean": 0.4375,
      "rewards/format_reward/std": 0.4986824691295624,
      "step": 320
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0416666666666665,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3416.0,
      "completions/mean_length": 2418.979248046875,
      "completions/mean_terminated_length": 1301.5101318359375,
      "completions/min_length": 310.0,
      "completions/min_terminated_length": 310.0,
      "epoch": 0.7337142857142858,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.14306552708148956,
      "kl": 0.0007654825846354166,
      "learning_rate": 4.1094235253127375e-08,
      "loss": 0.0949,
      "num_tokens": 91633496.0,
      "reward": 0.53125,
      "reward_std": 0.3111877143383026,
      "rewards/format_reward/mean": 0.53125,
      "rewards/format_reward/std": 0.5016420483589172,
      "step": 321
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.375,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3522.0,
      "completions/mean_length": 2838.3125,
      "completions/mean_terminated_length": 1414.727294921875,
      "completions/min_length": 441.0,
      "completions/min_terminated_length": 441.0,
      "epoch": 0.736,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.13922828435897827,
      "kl": 0.0007216135660807291,
      "learning_rate": 4.079579333738039e-08,
      "loss": 0.0487,
      "num_tokens": 91922564.0,
      "reward": 0.3645833432674408,
      "reward_std": 0.2653941810131073,
      "rewards/format_reward/mean": 0.3645833432674408,
      "rewards/format_reward/std": 0.4838397204875946,
      "step": 322
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.29166666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3510.0,
      "completions/mean_length": 2946.822998046875,
      "completions/mean_terminated_length": 1610.806396484375,
      "completions/min_length": 311.0,
      "completions/min_terminated_length": 311.0,
      "epoch": 0.7382857142857143,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.12766923010349274,
      "kl": 0.0007079442342122396,
      "learning_rate": 4.049804371462701e-08,
      "loss": 0.0639,
      "num_tokens": 92221317.0,
      "reward": 0.3645833432674408,
      "reward_std": 0.26059263944625854,
      "rewards/format_reward/mean": 0.3645833432674408,
      "rewards/format_reward/std": 0.4838397204875946,
      "step": 323
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.75,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3496.0,
      "completions/mean_length": 2670.479248046875,
      "completions/mean_terminated_length": 1495.952392578125,
      "completions/min_length": 315.0,
      "completions/min_terminated_length": 315.0,
      "epoch": 0.7405714285714285,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.11286967992782593,
      "kl": 0.0006717046101888021,
      "learning_rate": 4.020100089676376e-08,
      "loss": 0.046,
      "num_tokens": 92493745.0,
      "reward": 0.4895833432674408,
      "reward_std": 0.2466380000114441,
      "rewards/format_reward/mean": 0.4895833432674408,
      "rewards/format_reward/std": 0.5025155544281006,
      "step": 324
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.75,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 2596.0,
      "completions/mean_length": 1620.2083740234375,
      "completions/mean_terminated_length": 727.5758056640625,
      "completions/min_length": 341.0,
      "completions/min_terminated_length": 341.0,
      "epoch": 0.7428571428571429,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.12034157663583755,
      "kl": 0.0006987253824869791,
      "learning_rate": 3.990467936123853e-08,
      "loss": 0.073,
      "num_tokens": 92664135.0,
      "reward": 0.6875,
      "reward_std": 0.18536798655986786,
      "rewards/format_reward/mean": 0.6875,
      "rewards/format_reward/std": 0.4659455418586731,
      "step": 325
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.9583333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 2718.0,
      "completions/mean_length": 2402.6875,
      "completions/mean_terminated_length": 1171.1063232421875,
      "completions/min_length": 449.0,
      "completions/min_terminated_length": 449.0,
      "epoch": 0.7451428571428571,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.10850421339273453,
      "kl": 0.0007120768229166666,
      "learning_rate": 3.960909355034491e-08,
      "loss": 0.0525,
      "num_tokens": 92910705.0,
      "reward": 0.5104167461395264,
      "reward_std": 0.1921273171901703,
      "rewards/format_reward/mean": 0.5104166865348816,
      "rewards/format_reward/std": 0.5025155544281006,
      "step": 326
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.04166666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3081.0,
      "completions/mean_length": 3023.33349609375,
      "completions/mean_terminated_length": 1431.0399169921875,
      "completions/min_length": 492.0,
      "completions/min_terminated_length": 492.0,
      "epoch": 0.7474285714285714,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1040889248251915,
      "kl": 0.0006821950276692709,
      "learning_rate": 3.931425787051832e-08,
      "loss": 0.0867,
      "num_tokens": 93217169.0,
      "reward": 0.3020833730697632,
      "reward_std": 0.23311930894851685,
      "rewards/format_reward/mean": 0.3020833432674408,
      "rewards/format_reward/std": 0.46157145500183105,
      "step": 327
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.9166666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3288.0,
      "completions/mean_length": 2443.197998046875,
      "completions/mean_terminated_length": 1203.1956787109375,
      "completions/min_length": 237.0,
      "completions/min_terminated_length": 237.0,
      "epoch": 0.7497142857142857,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.0960441380739212,
      "kl": 0.0006745656331380209,
      "learning_rate": 3.9020186691633835e-08,
      "loss": 0.0155,
      "num_tokens": 93467232.0,
      "reward": 0.46875,
      "reward_std": 0.14109627902507782,
      "rewards/format_reward/mean": 0.46875,
      "rewards/format_reward/std": 0.5016420483589172,
      "step": 328
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.41666666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3566.0,
      "completions/mean_length": 2851.48974609375,
      "completions/mean_terminated_length": 1515.7353515625,
      "completions/min_length": 544.0,
      "completions/min_terminated_length": 544.0,
      "epoch": 0.752,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.12683966755867004,
      "kl": 0.0006411870320638021,
      "learning_rate": 3.8726894346305846e-08,
      "loss": 0.0939,
      "num_tokens": 93757343.0,
      "reward": 0.375,
      "reward_std": 0.26735198497772217,
      "rewards/format_reward/mean": 0.375,
      "rewards/format_reward/std": 0.4866642653942108,
      "step": 329
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3498.0,
      "completions/mean_length": 2779.979248046875,
      "completions/mean_terminated_length": 1439.9444580078125,
      "completions/min_length": 342.0,
      "completions/min_terminated_length": 342.0,
      "epoch": 0.7542857142857143,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.10126888751983643,
      "kl": 0.0005882581075032552,
      "learning_rate": 3.843439512918949e-08,
      "loss": 0.0151,
      "num_tokens": 94039881.0,
      "reward": 0.3854166865348816,
      "reward_std": 0.20084446668624878,
      "rewards/format_reward/mean": 0.3854166567325592,
      "rewards/format_reward/std": 0.48924845457077026,
      "step": 330
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7916666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3549.0,
      "completions/mean_length": 2738.9375,
      "completions/mean_terminated_length": 1697.348876953125,
      "completions/min_length": 431.0,
      "completions/min_terminated_length": 431.0,
      "epoch": 0.7565714285714286,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.12018129974603653,
      "kl": 0.0006844202677408854,
      "learning_rate": 3.814270329628395e-08,
      "loss": 0.0451,
      "num_tokens": 94318815.0,
      "reward": 0.5416666865348816,
      "reward_std": 0.25731295347213745,
      "rewards/format_reward/mean": 0.5416666865348816,
      "rewards/format_reward/std": 0.5008764266967773,
      "step": 331
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7916666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3470.0,
      "completions/mean_length": 2548.42724609375,
      "completions/mean_terminated_length": 1272.023193359375,
      "completions/min_length": 495.0,
      "completions/min_terminated_length": 495.0,
      "epoch": 0.7588571428571429,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.1146484985947609,
      "kl": 0.0006783803304036459,
      "learning_rate": 3.785183306423767e-08,
      "loss": 0.0466,
      "num_tokens": 94579376.0,
      "reward": 0.4479166865348816,
      "reward_std": 0.18208828568458557,
      "rewards/format_reward/mean": 0.4479166567325592,
      "rewards/format_reward/std": 0.49989035725593567,
      "step": 332
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.6666666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3445.0,
      "completions/mean_length": 2462.11474609375,
      "completions/mean_terminated_length": 891.4750366210938,
      "completions/min_length": 374.0,
      "completions/min_terminated_length": 374.0,
      "epoch": 0.7611428571428571,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1057872623205185,
      "kl": 0.0007009506225585938,
      "learning_rate": 3.756179860965537e-08,
      "loss": 0.0786,
      "num_tokens": 94830823.0,
      "reward": 0.4479166865348816,
      "reward_std": 0.22440217435359955,
      "rewards/format_reward/mean": 0.4479166567325592,
      "rewards/format_reward/std": 0.49989035725593567,
      "step": 333
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.9166666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3357.0,
      "completions/mean_length": 2491.541748046875,
      "completions/mean_terminated_length": 1304.0870361328125,
      "completions/min_length": 349.0,
      "completions/min_terminated_length": 349.0,
      "epoch": 0.7634285714285715,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1354176104068756,
      "kl": 0.0006554921468098959,
      "learning_rate": 3.72726140684072e-08,
      "loss": 0.0624,
      "num_tokens": 95085887.0,
      "reward": 0.4895833432674408,
      "reward_std": 0.23703491687774658,
      "rewards/format_reward/mean": 0.4895833432674408,
      "rewards/format_reward/std": 0.5025155544281006,
      "step": 334
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3452.0,
      "completions/mean_length": 2790.34375,
      "completions/mean_terminated_length": 1467.5833740234375,
      "completions/min_length": 426.0,
      "completions/min_terminated_length": 426.0,
      "epoch": 0.7657142857142857,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.10556024312973022,
      "kl": 0.0007130304972330729,
      "learning_rate": 3.698429353493974e-08,
      "loss": 0.041,
      "num_tokens": 95369774.0,
      "reward": 0.4375,
      "reward_std": 0.23116151988506317,
      "rewards/format_reward/mean": 0.4375,
      "rewards/format_reward/std": 0.4986824691295624,
      "step": 335
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.8333333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3576.0,
      "completions/mean_length": 2687.52099609375,
      "completions/mean_terminated_length": 1628.0455322265625,
      "completions/min_length": 280.0,
      "completions/min_terminated_length": 280.0,
      "epoch": 0.768,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.1153315082192421,
      "kl": 0.0007044474283854166,
      "learning_rate": 3.669685106158899e-08,
      "loss": 0.0709,
      "num_tokens": 95644972.0,
      "reward": 0.4895833432674408,
      "reward_std": 0.2566770315170288,
      "rewards/format_reward/mean": 0.4895833432674408,
      "rewards/format_reward/std": 0.5025155544281006,
      "step": 336
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.45833333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3554.0,
      "completions/mean_length": 2809.30224609375,
      "completions/mean_terminated_length": 1459.1142578125,
      "completions/min_length": 468.0,
      "completions/min_terminated_length": 468.0,
      "epoch": 0.7702857142857142,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.11351940035820007,
      "kl": 0.0006767908732096354,
      "learning_rate": 3.641030065789562e-08,
      "loss": 0.0491,
      "num_tokens": 95931273.0,
      "reward": 0.40625,
      "reward_std": 0.25187548995018005,
      "rewards/format_reward/mean": 0.40625,
      "rewards/format_reward/std": 0.4937104284763336,
      "step": 337
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.33333333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3510.0,
      "completions/mean_length": 2894.135498046875,
      "completions/mean_terminated_length": 1514.40625,
      "completions/min_length": 427.0,
      "completions/min_terminated_length": 427.0,
      "epoch": 0.7725714285714286,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.1154508963227272,
      "kl": 0.0006427764892578125,
      "learning_rate": 3.612465628992203e-08,
      "loss": 0.0637,
      "num_tokens": 96225760.0,
      "reward": 0.3958333432674408,
      "reward_std": 0.2076037973165512,
      "rewards/format_reward/mean": 0.3958333432674408,
      "rewards/format_reward/std": 0.4915960133075714,
      "step": 338
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.4583333333333335,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3414.0,
      "completions/mean_length": 2150.77099609375,
      "completions/mean_terminated_length": 1251.966064453125,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "epoch": 0.7748571428571429,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.12287288159132004,
      "kl": 0.0007944107055664062,
      "learning_rate": 3.583993187957173e-08,
      "loss": 0.093,
      "num_tokens": 96448392.0,
      "reward": 0.6354166865348816,
      "reward_std": 0.21436314284801483,
      "rewards/format_reward/mean": 0.6354166865348816,
      "rewards/format_reward/std": 0.4838397204875946,
      "step": 339
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.45833333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3502.0,
      "completions/mean_length": 2877.80224609375,
      "completions/mean_terminated_length": 1647.0,
      "completions/min_length": 600.0,
      "completions/min_terminated_length": 600.0,
      "epoch": 0.7771428571428571,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.12801849842071533,
      "kl": 0.0006701151529947916,
      "learning_rate": 3.555614130391079e-08,
      "loss": 0.1074,
      "num_tokens": 96740495.0,
      "reward": 0.3958333432674408,
      "reward_std": 0.25863486528396606,
      "rewards/format_reward/mean": 0.3958333432674408,
      "rewards/format_reward/std": 0.4915960133075714,
      "step": 340
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.29166666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3425.0,
      "completions/mean_length": 2869.46875,
      "completions/mean_terminated_length": 1371.258056640625,
      "completions/min_length": 650.0,
      "completions/min_terminated_length": 650.0,
      "epoch": 0.7794285714285715,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.08208385109901428,
      "kl": 0.0006910959879557291,
      "learning_rate": 3.527329839449151e-08,
      "loss": 0.0053,
      "num_tokens": 97031972.0,
      "reward": 0.3854166865348816,
      "reward_std": 0.1498134285211563,
      "rewards/format_reward/mean": 0.3854166567325592,
      "rewards/format_reward/std": 0.4892484247684479,
      "step": 341
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.08333333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3543.0,
      "completions/mean_length": 2997.42724609375,
      "completions/mean_terminated_length": 1418.1923828125,
      "completions/min_length": 440.0,
      "completions/min_terminated_length": 440.0,
      "epoch": 0.7817142857142857,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.10631714016199112,
      "kl": 0.0007766087849934896,
      "learning_rate": 3.4991416936678275e-08,
      "loss": 0.072,
      "num_tokens": 97336189.0,
      "reward": 0.3125000298023224,
      "reward_std": 0.26735198497772217,
      "rewards/format_reward/mean": 0.3125,
      "rewards/format_reward/std": 0.4659455418586731,
      "step": 342
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7083333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3217.0,
      "completions/mean_length": 2630.229248046875,
      "completions/mean_terminated_length": 1350.7803955078125,
      "completions/min_length": 315.0,
      "completions/min_terminated_length": 315.0,
      "epoch": 0.784,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.11815189570188522,
      "kl": 0.0007492701212565104,
      "learning_rate": 3.4710510668975625e-08,
      "loss": 0.0648,
      "num_tokens": 97604297.0,
      "reward": 0.46875,
      "reward_std": 0.24183647334575653,
      "rewards/format_reward/mean": 0.46875,
      "rewards/format_reward/std": 0.5016420483589172,
      "step": 343
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.25,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3423.0,
      "completions/mean_length": 2484.42724609375,
      "completions/mean_terminated_length": 1629.2037353515625,
      "completions/min_length": 359.0,
      "completions/min_terminated_length": 359.0,
      "epoch": 0.7862857142857143,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.14397567510604858,
      "kl": 0.0006866455078125,
      "learning_rate": 3.4430593282358775e-08,
      "loss": 0.0537,
      "num_tokens": 97858234.0,
      "reward": 0.59375,
      "reward_std": 0.292867511510849,
      "rewards/format_reward/mean": 0.59375,
      "rewards/format_reward/std": 0.4937104284763336,
      "step": 344
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3552.0,
      "completions/mean_length": 3289.385498046875,
      "completions/mean_terminated_length": 2237.1904296875,
      "completions/min_length": 691.0,
      "completions/min_terminated_length": 691.0,
      "epoch": 0.7885714285714286,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.11778136342763901,
      "kl": 0.0006767908732096354,
      "learning_rate": 3.4151678419606236e-08,
      "loss": 0.0647,
      "num_tokens": 98190275.0,
      "reward": 0.2916666865348816,
      "reward_std": 0.29090970754623413,
      "rewards/format_reward/mean": 0.2916666567325592,
      "rewards/format_reward/std": 0.4569156765937805,
      "step": 345
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5416666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3513.0,
      "completions/mean_length": 2899.73974609375,
      "completions/mean_terminated_length": 1808.6217041015625,
      "completions/min_length": 521.0,
      "completions/min_terminated_length": 521.0,
      "epoch": 0.7908571428571428,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.1161077618598938,
      "kl": 0.0007422765096028646,
      "learning_rate": 3.387377967463493e-08,
      "loss": 0.0585,
      "num_tokens": 98484130.0,
      "reward": 0.4166666865348816,
      "reward_std": 0.29962682723999023,
      "rewards/format_reward/mean": 0.4166666567325592,
      "rewards/format_reward/std": 0.4955946207046509,
      "step": 346
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.41666666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3493.0,
      "completions/mean_length": 2883.55224609375,
      "completions/mean_terminated_length": 1606.2647705078125,
      "completions/min_length": 468.0,
      "completions/min_terminated_length": 468.0,
      "epoch": 0.7931428571428571,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.12117592990398407,
      "kl": 0.0007146199544270834,
      "learning_rate": 3.359691059183761e-08,
      "loss": 0.072,
      "num_tokens": 98777283.0,
      "reward": 0.3854166865348816,
      "reward_std": 0.30638617277145386,
      "rewards/format_reward/mean": 0.3854166567325592,
      "rewards/format_reward/std": 0.4892484247684479,
      "step": 347
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.875,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3457.0,
      "completions/mean_length": 2407.635498046875,
      "completions/mean_terminated_length": 1074.4222412109375,
      "completions/min_length": 288.0,
      "completions/min_terminated_length": 288.0,
      "epoch": 0.7954285714285714,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.09884366393089294,
      "kl": 0.0006717046101888021,
      "learning_rate": 3.332108466542281e-08,
      "loss": 0.0584,
      "num_tokens": 99023944.0,
      "reward": 0.4687500596046448,
      "reward_std": 0.14981341361999512,
      "rewards/format_reward/mean": 0.46875,
      "rewards/format_reward/std": 0.5016420483589172,
      "step": 348
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.08333333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3320.0,
      "completions/mean_length": 3037.947998046875,
      "completions/mean_terminated_length": 1567.8077392578125,
      "completions/min_length": 539.0,
      "completions/min_terminated_length": 539.0,
      "epoch": 0.7977142857142857,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.09411304444074631,
      "kl": 0.0007225672403971354,
      "learning_rate": 3.3046315338757025e-08,
      "loss": 0.0547,
      "num_tokens": 99332093.0,
      "reward": 0.2916666865348816,
      "reward_std": 0.2163209170103073,
      "rewards/format_reward/mean": 0.2916666567325592,
      "rewards/format_reward/std": 0.45691564679145813,
      "step": 349
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5416666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3508.0,
      "completions/mean_length": 2614.635498046875,
      "completions/mean_terminated_length": 1068.8919677734375,
      "completions/min_length": 329.0,
      "completions/min_terminated_length": 329.0,
      "epoch": 0.8,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 0.05556942895054817,
      "kl": 0.0007216135660807291,
      "learning_rate": 3.2772616003709616e-08,
      "loss": 0.0156,
      "num_tokens": 99598110.0,
      "reward": 0.3958333432674408,
      "reward_std": 0.08330589532852173,
      "rewards/format_reward/mean": 0.3958333432674408,
      "rewards/format_reward/std": 0.4915960133075714,
      "step": 350
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.16666666666666663,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3128.0,
      "completions/mean_length": 3123.5,
      "completions/mean_terminated_length": 1373.5999755859375,
      "completions/min_length": 521.0,
      "completions/min_terminated_length": 521.0,
      "epoch": 0.8022857142857143,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.10989365726709366,
      "kl": 0.0006758371988932291,
      "learning_rate": 3.250000000000001e-08,
      "loss": 0.0664,
      "num_tokens": 99914568.0,
      "reward": 0.2291666716337204,
      "reward_std": 0.23116151988506317,
      "rewards/format_reward/mean": 0.2291666716337204,
      "rewards/format_reward/std": 0.42250296473503113,
      "step": 351
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.29166666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 1704.0,
      "completions/mean_length": 2720.541748046875,
      "completions/mean_terminated_length": 910.0645141601562,
      "completions/min_length": 318.0,
      "completions/min_terminated_length": 318.0,
      "epoch": 0.8045714285714286,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.09264727681875229,
      "kl": 0.0007136662801106771,
      "learning_rate": 3.222848061454764e-08,
      "loss": 0.0383,
      "num_tokens": 100191934.0,
      "reward": 0.3541666865348816,
      "reward_std": 0.1343369334936142,
      "rewards/format_reward/mean": 0.3541666567325592,
      "rewards/format_reward/std": 0.48077061772346497,
      "step": 352
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7083333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 2655.0,
      "completions/mean_length": 2522.14599609375,
      "completions/mean_terminated_length": 1097.707275390625,
      "completions/min_length": 358.0,
      "completions/min_terminated_length": 358.0,
      "epoch": 0.8068571428571428,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.1318623125553131,
      "kl": 0.0012060801188151042,
      "learning_rate": 3.195807108082429e-08,
      "loss": 0.0476,
      "num_tokens": 100449906.0,
      "reward": 0.4583333432674408,
      "reward_std": 0.18404607474803925,
      "rewards/format_reward/mean": 0.4583333432674408,
      "rewards/format_reward/std": 0.5008764266967773,
      "step": 353
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5833333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3136.0,
      "completions/mean_length": 2691.5,
      "completions/mean_terminated_length": 1329.26318359375,
      "completions/min_length": 371.0,
      "completions/min_terminated_length": 371.0,
      "epoch": 0.8091428571428572,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.10259856283664703,
      "kl": 0.000599066416422526,
      "learning_rate": 3.168878457820915e-08,
      "loss": 0.0466,
      "num_tokens": 100723950.0,
      "reward": 0.4270833432674408,
      "reward_std": 0.22440217435359955,
      "rewards/format_reward/mean": 0.4270833432674408,
      "rewards/format_reward/std": 0.4972511827945709,
      "step": 354
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5833333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3548.0,
      "completions/mean_length": 2866.635498046875,
      "completions/mean_terminated_length": 1771.7105712890625,
      "completions/min_length": 389.0,
      "completions/min_terminated_length": 389.0,
      "epoch": 0.8114285714285714,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.12641818821430206,
      "kl": 0.0007648468017578125,
      "learning_rate": 3.1420634231346446e-08,
      "loss": 0.056,
      "num_tokens": 101015539.0,
      "reward": 0.4270833432674408,
      "reward_std": 0.2828284502029419,
      "rewards/format_reward/mean": 0.4270833432674408,
      "rewards/format_reward/std": 0.4972511827945709,
      "step": 355
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.5416666666666665,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3431.0,
      "completions/mean_length": 2200.28125,
      "completions/mean_terminated_length": 1406.34423828125,
      "completions/min_length": 388.0,
      "completions/min_terminated_length": 388.0,
      "epoch": 0.8137142857142857,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.13699595630168915,
      "kl": 0.0007009506225585938,
      "learning_rate": 3.1153633109505784e-08,
      "loss": 0.0562,
      "num_tokens": 101242588.0,
      "reward": 0.65625,
      "reward_std": 0.2466380000114441,
      "rewards/format_reward/mean": 0.65625,
      "rewards/format_reward/std": 0.4774521291255951,
      "step": 356
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7916666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3553.0,
      "completions/mean_length": 2807.86474609375,
      "completions/mean_terminated_length": 1851.2325439453125,
      "completions/min_length": 429.0,
      "completions/min_terminated_length": 429.0,
      "epoch": 0.816,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.13702301681041718,
      "kl": 0.0007762908935546875,
      "learning_rate": 3.088779422594514e-08,
      "loss": 0.0514,
      "num_tokens": 101528007.0,
      "reward": 0.4791666865348816,
      "reward_std": 0.29962682723999023,
      "rewards/format_reward/mean": 0.4791666567325592,
      "rewards/format_reward/std": 0.5021882057189941,
      "step": 357
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.125,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3569.0,
      "completions/mean_length": 2373.041748046875,
      "completions/mean_terminated_length": 1304.549072265625,
      "completions/min_length": 411.0,
      "completions/min_terminated_length": 411.0,
      "epoch": 0.8182857142857143,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.12015900760889053,
      "kl": 0.0007022221883138021,
      "learning_rate": 3.062313053727671e-08,
      "loss": 0.0235,
      "num_tokens": 101771329.0,
      "reward": 0.5729166865348816,
      "reward_std": 0.20084445178508759,
      "rewards/format_reward/mean": 0.5729166865348816,
      "rewards/format_reward/std": 0.4972511827945709,
      "step": 358
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5833333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 2831.0,
      "completions/mean_length": 2700.229248046875,
      "completions/mean_terminated_length": 1351.3157958984375,
      "completions/min_length": 287.0,
      "completions/min_terminated_length": 287.0,
      "epoch": 0.8205714285714286,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.12044034153223038,
      "kl": 0.0006585121154785156,
      "learning_rate": 3.035965494283524e-08,
      "loss": 0.0362,
      "num_tokens": 102045719.0,
      "reward": 0.4895833432674408,
      "reward_std": 0.20956158638000488,
      "rewards/format_reward/mean": 0.4895833432674408,
      "rewards/format_reward/std": 0.5025155544281006,
      "step": 359
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3560.0,
      "completions/mean_length": 2882.760498046875,
      "completions/mean_terminated_length": 1714.02783203125,
      "completions/min_length": 786.0,
      "completions/min_terminated_length": 786.0,
      "epoch": 0.8228571428571428,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.11738189309835434,
      "kl": 0.0007073084513346354,
      "learning_rate": 3.0097380284049524e-08,
      "loss": 0.0712,
      "num_tokens": 102339192.0,
      "reward": 0.40625,
      "reward_std": 0.23311933875083923,
      "rewards/format_reward/mean": 0.40625,
      "rewards/format_reward/std": 0.4937104284763336,
      "step": 360
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.08333333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3363.0,
      "completions/mean_length": 3011.21875,
      "completions/mean_terminated_length": 1469.115478515625,
      "completions/min_length": 443.0,
      "completions/min_terminated_length": 443.0,
      "epoch": 0.8251428571428572,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.08504962176084518,
      "kl": 0.0006786982218424479,
      "learning_rate": 2.983631934381639e-08,
      "loss": 0.0216,
      "num_tokens": 102644433.0,
      "reward": 0.3020833432674408,
      "reward_std": 0.16856959462165833,
      "rewards/format_reward/mean": 0.3020833432674408,
      "rewards/format_reward/std": 0.46157145500183105,
      "step": 361
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5833333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3492.0,
      "completions/mean_length": 2842.625,
      "completions/mean_terminated_length": 1711.0526123046875,
      "completions/min_length": 607.0,
      "completions/min_terminated_length": 607.0,
      "epoch": 0.8274285714285714,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.10463716834783554,
      "kl": 0.0006786982218424479,
      "learning_rate": 2.957648484587779e-08,
      "loss": 0.0259,
      "num_tokens": 102933069.0,
      "reward": 0.4479166865348816,
      "reward_std": 0.18208828568458557,
      "rewards/format_reward/mean": 0.4479166567325592,
      "rewards/format_reward/std": 0.49989035725593567,
      "step": 362
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.6666666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 2837.0,
      "completions/mean_length": 2513.48974609375,
      "completions/mean_terminated_length": 1014.7750244140625,
      "completions/min_length": 339.0,
      "completions/min_terminated_length": 339.0,
      "epoch": 0.8297142857142857,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.10111965984106064,
      "kl": 0.000690460205078125,
      "learning_rate": 2.931788945420058e-08,
      "loss": 0.056,
      "num_tokens": 103190318.0,
      "reward": 0.4375,
      "reward_std": 0.1430540829896927,
      "rewards/format_reward/mean": 0.4375,
      "rewards/format_reward/std": 0.4986824691295624,
      "step": 363
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7916666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3434.0,
      "completions/mean_length": 2549.52099609375,
      "completions/mean_terminated_length": 1274.465087890625,
      "completions/min_length": 352.0,
      "completions/min_terminated_length": 352.0,
      "epoch": 0.832,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.12226539850234985,
      "kl": 0.0006707509358723959,
      "learning_rate": 2.906054577235931e-08,
      "loss": 0.036,
      "num_tokens": 103451272.0,
      "reward": 0.4895833432674408,
      "reward_std": 0.24183647334575653,
      "rewards/format_reward/mean": 0.4895833432674408,
      "rewards/format_reward/std": 0.5025155544281006,
      "step": 364
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0833333333333335,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3411.0,
      "completions/mean_length": 2429.125,
      "completions/mean_terminated_length": 1366.6400146484375,
      "completions/min_length": 228.0,
      "completions/min_terminated_length": 228.0,
      "epoch": 0.8342857142857143,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.16473808884620667,
      "kl": 0.0008948644002278646,
      "learning_rate": 2.8804466342921984e-08,
      "loss": 0.0599,
      "num_tokens": 103700398.0,
      "reward": 0.5416666865348816,
      "reward_std": 0.2773910164833069,
      "rewards/format_reward/mean": 0.5416666865348816,
      "rewards/format_reward/std": 0.5008764266967773,
      "step": 365
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.6666666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3532.0,
      "completions/mean_length": 2656.1875,
      "completions/mean_terminated_length": 1357.25,
      "completions/min_length": 287.0,
      "completions/min_terminated_length": 287.0,
      "epoch": 0.8365714285714285,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.1141432523727417,
      "kl": 0.0007975896199544271,
      "learning_rate": 2.8549663646838718e-08,
      "loss": 0.0871,
      "num_tokens": 103971184.0,
      "reward": 0.4375,
      "reward_std": 0.26343637704849243,
      "rewards/format_reward/mean": 0.4375,
      "rewards/format_reward/std": 0.4986824691295624,
      "step": 366
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7083333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3487.0,
      "completions/mean_length": 2758.75,
      "completions/mean_terminated_length": 1651.707275390625,
      "completions/min_length": 362.0,
      "completions/min_terminated_length": 362.0,
      "epoch": 0.8388571428571429,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.12193860113620758,
      "kl": 0.0007162094116210938,
      "learning_rate": 2.8296150102833438e-08,
      "loss": 0.0744,
      "num_tokens": 104252638.0,
      "reward": 0.4479166865348816,
      "reward_std": 0.29634714126586914,
      "rewards/format_reward/mean": 0.4479166567325592,
      "rewards/format_reward/std": 0.49989035725593567,
      "step": 367
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.875,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3575.0,
      "completions/mean_length": 2476.61474609375,
      "completions/mean_terminated_length": 1221.5777587890625,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "epoch": 0.8411428571428572,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.15184471011161804,
      "kl": 0.0006783803304036459,
      "learning_rate": 2.8043938066798646e-08,
      "loss": 0.0577,
      "num_tokens": 104505573.0,
      "reward": 0.5,
      "reward_std": 0.279984712600708,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.5026246905326843,
      "step": 368
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.20833333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3153.0,
      "completions/mean_length": 2976.09375,
      "completions/mean_terminated_length": 1571.6207275390625,
      "completions/min_length": 497.0,
      "completions/min_terminated_length": 497.0,
      "epoch": 0.8434285714285714,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.09829284250736237,
      "kl": 0.0007152557373046875,
      "learning_rate": 2.7793039831193132e-08,
      "loss": 0.051,
      "num_tokens": 104807070.0,
      "reward": 0.3020833432674408,
      "reward_std": 0.17337113618850708,
      "rewards/format_reward/mean": 0.3020833432674408,
      "rewards/format_reward/std": 0.46157145500183105,
      "step": 369
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.6666666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3407.0,
      "completions/mean_length": 2551.53125,
      "completions/mean_terminated_length": 1106.0750732421875,
      "completions/min_length": 403.0,
      "completions/min_terminated_length": 403.0,
      "epoch": 0.8457142857142858,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.11349817365407944,
      "kl": 0.0007206598917643229,
      "learning_rate": 2.7543467624442956e-08,
      "loss": 0.032,
      "num_tokens": 105067743.0,
      "reward": 0.4479166865348816,
      "reward_std": 0.16856960952281952,
      "rewards/format_reward/mean": 0.4479166567325592,
      "rewards/format_reward/std": 0.49989035725593567,
      "step": 370
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5416666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3571.0,
      "completions/mean_length": 2872.08349609375,
      "completions/mean_terminated_length": 1736.8648681640625,
      "completions/min_length": 336.0,
      "completions/min_terminated_length": 336.0,
      "epoch": 0.848,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.09595135599374771,
      "kl": 0.0007193883260091146,
      "learning_rate": 2.729523361034538e-08,
      "loss": 0.0229,
      "num_tokens": 105359933.0,
      "reward": 0.4791666865348816,
      "reward_std": 0.22112248837947845,
      "rewards/format_reward/mean": 0.4791666567325592,
      "rewards/format_reward/std": 0.5021882057189941,
      "step": 371
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.2083333333333335,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3099.0,
      "completions/mean_length": 2185.729248046875,
      "completions/mean_terminated_length": 1051.2830810546875,
      "completions/min_length": 360.0,
      "completions/min_terminated_length": 360.0,
      "epoch": 0.8502857142857143,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 0.10922639071941376,
      "kl": 0.0007880528767903646,
      "learning_rate": 2.7048349887476035e-08,
      "loss": 0.0458,
      "num_tokens": 105585075.0,
      "reward": 0.5625,
      "reward_std": 0.09202304482460022,
      "rewards/format_reward/mean": 0.5625,
      "rewards/format_reward/std": 0.4986824691295624,
      "step": 372
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7916666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3152.0,
      "completions/mean_length": 2533.77099609375,
      "completions/mean_terminated_length": 1239.3023681640625,
      "completions/min_length": 190.0,
      "completions/min_terminated_length": 190.0,
      "epoch": 0.8525714285714285,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.11121516674757004,
      "kl": 0.0008538564046223959,
      "learning_rate": 2.6802828488599292e-08,
      "loss": 0.0657,
      "num_tokens": 105843623.0,
      "reward": 0.4583333432674408,
      "reward_std": 0.16661179065704346,
      "rewards/format_reward/mean": 0.4583333432674408,
      "rewards/format_reward/std": 0.5008764266967773,
      "step": 373
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.08333333333333337,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3351.0,
      "completions/mean_length": 3105.53125,
      "completions/mean_terminated_length": 1496.1363525390625,
      "completions/min_length": 549.0,
      "completions/min_terminated_length": 549.0,
      "epoch": 0.8548571428571429,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.08799436688423157,
      "kl": 0.000644683837890625,
      "learning_rate": 2.655868138008171e-08,
      "loss": 0.0094,
      "num_tokens": 106158698.0,
      "reward": 0.2604166865348816,
      "reward_std": 0.13629473745822906,
      "rewards/format_reward/mean": 0.2604166567325592,
      "rewards/format_reward/std": 0.4411657154560089,
      "step": 374
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.25,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3215.0,
      "completions/mean_length": 2966.28125,
      "completions/mean_terminated_length": 1607.300048828125,
      "completions/min_length": 517.0,
      "completions/min_terminated_length": 517.0,
      "epoch": 0.8571428571428571,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.12971101701259613,
      "kl": 0.0007556279500325521,
      "learning_rate": 2.631592046130896e-08,
      "loss": 0.0489,
      "num_tokens": 106459577.0,
      "reward": 0.3645833432674408,
      "reward_std": 0.2841503620147705,
      "rewards/format_reward/mean": 0.3645833432674408,
      "rewards/format_reward/std": 0.4838397204875946,
      "step": 375
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.9583333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3452.0,
      "completions/mean_length": 2502.979248046875,
      "completions/mean_terminated_length": 1375.9573974609375,
      "completions/min_length": 294.0,
      "completions/min_terminated_length": 294.0,
      "epoch": 0.8594285714285714,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.13539092242717743,
      "kl": 0.0008398691813151041,
      "learning_rate": 2.6074557564105726e-08,
      "loss": 0.0468,
      "num_tokens": 106715913.0,
      "reward": 0.53125,
      "reward_std": 0.2841503620147705,
      "rewards/format_reward/mean": 0.53125,
      "rewards/format_reward/std": 0.5016420483589172,
      "step": 376
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3555.0,
      "completions/mean_length": 2986.354248046875,
      "completions/mean_terminated_length": 1990.27783203125,
      "completions/min_length": 480.0,
      "completions/min_terminated_length": 480.0,
      "epoch": 0.8617142857142858,
      "frac_reward_zero_std": 0.3125,
      "grad_norm": 0.11568745225667953,
      "kl": 0.0006910959879557291,
      "learning_rate": 2.583460445215911e-08,
      "loss": 0.0814,
      "num_tokens": 107019031.0,
      "reward": 0.40625,
      "reward_std": 0.2941893935203552,
      "rewards/format_reward/mean": 0.40625,
      "rewards/format_reward/std": 0.4937104284763336,
      "step": 377
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.29166666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 2810.0,
      "completions/mean_length": 2766.322998046875,
      "completions/mean_terminated_length": 1051.838623046875,
      "completions/min_length": 365.0,
      "completions/min_terminated_length": 365.0,
      "epoch": 0.864,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 0.0754118263721466,
      "kl": 0.00070953369140625,
      "learning_rate": 2.559607282044525e-08,
      "loss": -0.0142,
      "num_tokens": 107300384.0,
      "reward": 0.3541666865348816,
      "reward_std": 0.08330589532852173,
      "rewards/format_reward/mean": 0.3541666567325592,
      "rewards/format_reward/std": 0.4807705879211426,
      "step": 378
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7083333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3510.0,
      "completions/mean_length": 2744.45849609375,
      "completions/mean_terminated_length": 1618.243896484375,
      "completions/min_length": 386.0,
      "completions/min_terminated_length": 386.0,
      "epoch": 0.8662857142857143,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.11120415478944778,
      "kl": 0.0007082621256510416,
      "learning_rate": 2.5358974294659374e-08,
      "loss": 0.0461,
      "num_tokens": 107579446.0,
      "reward": 0.4791666865348816,
      "reward_std": 0.2350771278142929,
      "rewards/format_reward/mean": 0.4791666567325592,
      "rewards/format_reward/std": 0.5021882057189941,
      "step": 379
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.625,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3386.0,
      "completions/mean_length": 2754.791748046875,
      "completions/mean_terminated_length": 1542.871826171875,
      "completions/min_length": 181.0,
      "completions/min_terminated_length": 181.0,
      "epoch": 0.8685714285714285,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.07520893216133118,
      "kl": 0.0006279945373535156,
      "learning_rate": 2.5123320430649132e-08,
      "loss": 0.0355,
      "num_tokens": 107860820.0,
      "reward": 0.4166666865348816,
      "reward_std": 0.1343369334936142,
      "rewards/format_reward/mean": 0.4166666567325592,
      "rewards/format_reward/std": 0.49559465050697327,
      "step": 380
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.33333333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3569.0,
      "completions/mean_length": 2954.354248046875,
      "completions/mean_terminated_length": 1695.0625,
      "completions/min_length": 401.0,
      "completions/min_terminated_length": 401.0,
      "epoch": 0.8708571428571429,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.12509968876838684,
      "kl": 0.0007979075113932291,
      "learning_rate": 2.4889122713851394e-08,
      "loss": 0.0302,
      "num_tokens": 108159756.0,
      "reward": 0.3854166865348816,
      "reward_std": 0.2780269384384155,
      "rewards/format_reward/mean": 0.3854166567325592,
      "rewards/format_reward/std": 0.4892484247684479,
      "step": 381
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0416666666666665,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3580.0,
      "completions/mean_length": 2414.58349609375,
      "completions/mean_terminated_length": 1292.89794921875,
      "completions/min_length": 305.0,
      "completions/min_terminated_length": 305.0,
      "epoch": 0.8731428571428571,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.14563868939876556,
      "kl": 0.0008230209350585938,
      "learning_rate": 2.465639255873246e-08,
      "loss": 0.1217,
      "num_tokens": 108407714.0,
      "reward": 0.5208333730697632,
      "reward_std": 0.2437942624092102,
      "rewards/format_reward/mean": 0.5208333134651184,
      "rewards/format_reward/std": 0.5021882057189941,
      "step": 382
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.33333333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3543.0,
      "completions/mean_length": 3013.78125,
      "completions/mean_terminated_length": 1873.34375,
      "completions/min_length": 485.0,
      "completions/min_terminated_length": 485.0,
      "epoch": 0.8754285714285714,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.08111817389726639,
      "kl": 0.0006319681803385416,
      "learning_rate": 2.4425141308231766e-08,
      "loss": 0.0282,
      "num_tokens": 108712769.0,
      "reward": 0.3645833432674408,
      "reward_std": 0.16856959462165833,
      "rewards/format_reward/mean": 0.3645833432674408,
      "rewards/format_reward/std": 0.4838397204875946,
      "step": 383
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.08333333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3499.0,
      "completions/mean_length": 2961.375,
      "completions/mean_terminated_length": 1285.0770263671875,
      "completions/min_length": 569.0,
      "completions/min_terminated_length": 569.0,
      "epoch": 0.8777142857142857,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.0816965252161026,
      "kl": 0.0006729761759440104,
      "learning_rate": 2.4195380233209007e-08,
      "loss": 0.0401,
      "num_tokens": 109013531.0,
      "reward": 0.3229166865348816,
      "reward_std": 0.17337113618850708,
      "rewards/format_reward/mean": 0.3229166567325592,
      "rewards/format_reward/std": 0.4700457453727722,
      "step": 384
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0833333333333335,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3508.0,
      "completions/mean_length": 2472.34375,
      "completions/mean_terminated_length": 1449.6199951171875,
      "completions/min_length": 390.0,
      "completions/min_terminated_length": 390.0,
      "epoch": 0.88,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.1090082898736,
      "kl": 0.0006866455078125,
      "learning_rate": 2.396712053189486e-08,
      "loss": 0.0471,
      "num_tokens": 109266878.0,
      "reward": 0.5520833730697632,
      "reward_std": 0.20564600825309753,
      "rewards/format_reward/mean": 0.5520833134651184,
      "rewards/format_reward/std": 0.4998903274536133,
      "step": 385
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.9583333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3579.0,
      "completions/mean_length": 2502.010498046875,
      "completions/mean_terminated_length": 1373.9786376953125,
      "completions/min_length": 321.0,
      "completions/min_terminated_length": 321.0,
      "epoch": 0.8822857142857143,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.13692055642604828,
      "kl": 0.0008602142333984375,
      "learning_rate": 2.3740373329345117e-08,
      "loss": 0.034,
      "num_tokens": 109522185.0,
      "reward": 0.53125,
      "reward_std": 0.21436314284801483,
      "rewards/format_reward/mean": 0.53125,
      "rewards/format_reward/std": 0.5016420483589172,
      "step": 386
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.875,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3546.0,
      "completions/mean_length": 2516.27099609375,
      "completions/mean_terminated_length": 1306.1778564453125,
      "completions/min_length": 409.0,
      "completions/min_terminated_length": 409.0,
      "epoch": 0.8845714285714286,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.10094159841537476,
      "kl": 0.0007845560709635416,
      "learning_rate": 2.3515149676898552e-08,
      "loss": 0.0066,
      "num_tokens": 109779251.0,
      "reward": 0.4895833432674408,
      "reward_std": 0.14981341361999512,
      "rewards/format_reward/mean": 0.4895833432674408,
      "rewards/format_reward/std": 0.5025155544281006,
      "step": 387
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3519.0,
      "completions/mean_length": 2394.791748046875,
      "completions/mean_terminated_length": 1205.5833740234375,
      "completions/min_length": 417.0,
      "completions/min_terminated_length": 417.0,
      "epoch": 0.8868571428571429,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.13009528815746307,
      "kl": 0.000751495361328125,
      "learning_rate": 2.3291460551638238e-08,
      "loss": 0.0254,
      "num_tokens": 110024655.0,
      "reward": 0.5208333730697632,
      "reward_std": 0.24511615931987762,
      "rewards/format_reward/mean": 0.5208333134651184,
      "rewards/format_reward/std": 0.5021882057189941,
      "step": 388
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.375,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3308.0,
      "completions/mean_length": 2202.5625,
      "completions/mean_terminated_length": 1257.368408203125,
      "completions/min_length": 214.0,
      "completions/min_terminated_length": 214.0,
      "epoch": 0.8891428571428571,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.12453688681125641,
      "kl": 0.0007718404134114584,
      "learning_rate": 2.306931685585657e-08,
      "loss": 0.0292,
      "num_tokens": 110250729.0,
      "reward": 0.6041666865348816,
      "reward_std": 0.1888476312160492,
      "rewards/format_reward/mean": 0.6041666865348816,
      "rewards/format_reward/std": 0.4915960133075714,
      "step": 389
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.04166666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3554.0,
      "completions/mean_length": 3042.70849609375,
      "completions/mean_terminated_length": 1505.43994140625,
      "completions/min_length": 412.0,
      "completions/min_terminated_length": 412.0,
      "epoch": 0.8914285714285715,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.06971344351768494,
      "kl": 0.0007470448811848959,
      "learning_rate": 2.284872941652386e-08,
      "loss": -0.0,
      "num_tokens": 110558957.0,
      "reward": 0.3125,
      "reward_std": 0.11558075994253159,
      "rewards/format_reward/mean": 0.3125,
      "rewards/format_reward/std": 0.4659455418586731,
      "step": 390
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.6666666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3572.0,
      "completions/mean_length": 2606.4375,
      "completions/mean_terminated_length": 1237.8499755859375,
      "completions/min_length": 401.0,
      "completions/min_terminated_length": 401.0,
      "epoch": 0.8937142857142857,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.0966411605477333,
      "kl": 0.0006777445475260416,
      "learning_rate": 2.2629708984760707e-08,
      "loss": 0.0423,
      "num_tokens": 110824961.0,
      "reward": 0.4375,
      "reward_std": 0.18404607474803925,
      "rewards/format_reward/mean": 0.4375,
      "rewards/format_reward/std": 0.4986824691295624,
      "step": 391
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.375,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 2747.0,
      "completions/mean_length": 2658.072998046875,
      "completions/mean_terminated_length": 890.3939819335938,
      "completions/min_length": 249.0,
      "completions/min_terminated_length": 249.0,
      "epoch": 0.896,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.06006186455488205,
      "kl": 0.0006895065307617188,
      "learning_rate": 2.2412266235313973e-08,
      "loss": 0.0232,
      "num_tokens": 111095826.0,
      "reward": 0.3541666865348816,
      "reward_std": 0.10206206887960434,
      "rewards/format_reward/mean": 0.3541666567325592,
      "rewards/format_reward/std": 0.48077061772346497,
      "step": 392
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.375,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3393.0,
      "completions/mean_length": 2778.03125,
      "completions/mean_terminated_length": 1239.3636474609375,
      "completions/min_length": 299.0,
      "completions/min_terminated_length": 299.0,
      "epoch": 0.8982857142857142,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.13048560917377472,
      "kl": 0.0007575352986653646,
      "learning_rate": 2.2196411766036488e-08,
      "loss": -0.0169,
      "num_tokens": 111378747.0,
      "reward": 0.3958333432674408,
      "reward_std": 0.18404607474803925,
      "rewards/format_reward/mean": 0.3958333432674408,
      "rewards/format_reward/std": 0.4915960133075714,
      "step": 393
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.45833333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3331.0,
      "completions/mean_length": 2708.5625,
      "completions/mean_terminated_length": 1182.800048828125,
      "completions/min_length": 364.0,
      "completions/min_terminated_length": 364.0,
      "epoch": 0.9005714285714286,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 0.09630251675844193,
      "kl": 0.0006806055704752604,
      "learning_rate": 2.1982156097370558e-08,
      "loss": 0.0195,
      "num_tokens": 111653961.0,
      "reward": 0.40625,
      "reward_std": 0.085263691842556,
      "rewards/format_reward/mean": 0.40625,
      "rewards/format_reward/std": 0.4937104284763336,
      "step": 394
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.6666666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3521.0,
      "completions/mean_length": 2641.98974609375,
      "completions/mean_terminated_length": 1323.175048828125,
      "completions/min_length": 416.0,
      "completions/min_terminated_length": 416.0,
      "epoch": 0.9028571428571428,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.10200249403715134,
      "kl": 0.0007108052571614584,
      "learning_rate": 2.1769509671835225e-08,
      "loss": 0.0547,
      "num_tokens": 111923726.0,
      "reward": 0.4375,
      "reward_std": 0.16661179065704346,
      "rewards/format_reward/mean": 0.4375,
      "rewards/format_reward/std": 0.4986824691295624,
      "step": 395
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3534.0,
      "completions/mean_length": 2701.8125,
      "completions/mean_terminated_length": 1231.5,
      "completions/min_length": 505.0,
      "completions/min_terminated_length": 505.0,
      "epoch": 0.9051428571428571,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.10444901883602142,
      "kl": 0.000721136728922526,
      "learning_rate": 2.1558482853517254e-08,
      "loss": 0.0883,
      "num_tokens": 112199078.0,
      "reward": 0.3958333432674408,
      "reward_std": 0.21151940524578094,
      "rewards/format_reward/mean": 0.3958333432674408,
      "rewards/format_reward/std": 0.4915960133075714,
      "step": 396
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.125,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3227.0,
      "completions/mean_length": 3058.40625,
      "completions/mean_terminated_length": 1715.2222900390625,
      "completions/min_length": 549.0,
      "completions/min_terminated_length": 549.0,
      "epoch": 0.9074285714285715,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.13178245723247528,
      "kl": 0.0007457733154296875,
      "learning_rate": 2.1349085927566073e-08,
      "loss": 0.0902,
      "num_tokens": 112508141.0,
      "reward": 0.2916666865348816,
      "reward_std": 0.21764282882213593,
      "rewards/format_reward/mean": 0.2916666567325592,
      "rewards/format_reward/std": 0.45691564679145813,
      "step": 397
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.1666666666666665,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3503.0,
      "completions/mean_length": 2329.71875,
      "completions/mean_terminated_length": 1268.4039306640625,
      "completions/min_length": 361.0,
      "completions/min_terminated_length": 361.0,
      "epoch": 0.9097142857142857,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.12231790274381638,
      "kl": 0.0008589426676432291,
      "learning_rate": 2.1141329099692406e-08,
      "loss": 0.0171,
      "num_tokens": 112747184.0,
      "reward": 0.5625,
      "reward_std": 0.23987865447998047,
      "rewards/format_reward/mean": 0.5625,
      "rewards/format_reward/std": 0.4986824691295624,
      "step": 398
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.8333333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3501.0,
      "completions/mean_length": 2524.78125,
      "completions/mean_terminated_length": 1272.977294921875,
      "completions/min_length": 360.0,
      "completions/min_terminated_length": 360.0,
      "epoch": 0.912,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.12483935058116913,
      "kl": 0.0007489522298177084,
      "learning_rate": 2.093522249567097e-08,
      "loss": 0.0385,
      "num_tokens": 113004785.0,
      "reward": 0.4895833432674408,
      "reward_std": 0.14981341361999512,
      "rewards/format_reward/mean": 0.4895833432674408,
      "rewards/format_reward/std": 0.5025155544281006,
      "step": 399
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0416666666666665,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3218.0,
      "completions/mean_length": 2349.260498046875,
      "completions/mean_terminated_length": 1164.9183349609375,
      "completions/min_length": 304.0,
      "completions/min_terminated_length": 304.0,
      "epoch": 0.9142857142857143,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.12561240792274475,
      "kl": 0.0006993611653645834,
      "learning_rate": 2.0730776160846852e-08,
      "loss": 0.0626,
      "num_tokens": 113245656.0,
      "reward": 0.5104166865348816,
      "reward_std": 0.16856959462165833,
      "rewards/format_reward/mean": 0.5104166865348816,
      "rewards/format_reward/std": 0.5025155544281006,
      "step": 400
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.33333333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3573.0,
      "completions/mean_length": 2966.760498046875,
      "completions/mean_terminated_length": 1732.28125,
      "completions/min_length": 355.0,
      "completions/min_terminated_length": 355.0,
      "epoch": 0.9165714285714286,
      "frac_reward_zero_std": 0.3125,
      "grad_norm": 0.12740644812583923,
      "kl": 0.000766754150390625,
      "learning_rate": 2.0528000059645996e-08,
      "loss": 0.0233,
      "num_tokens": 113547043.0,
      "reward": 0.3854166865348816,
      "reward_std": 0.3203408420085907,
      "rewards/format_reward/mean": 0.3854166567325592,
      "rewards/format_reward/std": 0.4892484247684479,
      "step": 401
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.9583333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3461.0,
      "completions/mean_length": 2363.92724609375,
      "completions/mean_terminated_length": 1091.9361572265625,
      "completions/min_length": 365.0,
      "completions/min_terminated_length": 365.0,
      "epoch": 0.9188571428571428,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.11828096956014633,
      "kl": 0.0008023579915364584,
      "learning_rate": 2.032690407508949e-08,
      "loss": 0.0352,
      "num_tokens": 113790360.0,
      "reward": 0.5416666865348816,
      "reward_std": 0.2076037973165512,
      "rewards/format_reward/mean": 0.5416666865348816,
      "rewards/format_reward/std": 0.5008764266967773,
      "step": 402
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0416666666666665,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3477.0,
      "completions/mean_length": 2552.23974609375,
      "completions/mean_terminated_length": 1562.591796875,
      "completions/min_length": 276.0,
      "completions/min_terminated_length": 276.0,
      "epoch": 0.9211428571428572,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.11285211890935898,
      "kl": 0.0007266998291015625,
      "learning_rate": 2.0127498008311923e-08,
      "loss": 0.0397,
      "num_tokens": 114051569.0,
      "reward": 0.5208333730697632,
      "reward_std": 0.23987865447998047,
      "rewards/format_reward/mean": 0.5208333134651184,
      "rewards/format_reward/std": 0.5021882057189941,
      "step": 403
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.29166666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3406.0,
      "completions/mean_length": 2857.42724609375,
      "completions/mean_terminated_length": 1333.9676513671875,
      "completions/min_length": 451.0,
      "completions/min_terminated_length": 451.0,
      "epoch": 0.9234285714285714,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.10632804781198502,
      "kl": 0.0008452733357747396,
      "learning_rate": 1.9929791578083655e-08,
      "loss": 0.064,
      "num_tokens": 114341740.0,
      "reward": 0.3541666865348816,
      "reward_std": 0.22635996341705322,
      "rewards/format_reward/mean": 0.3541666567325592,
      "rewards/format_reward/std": 0.48077061772346497,
      "step": 404
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.125,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3487.0,
      "completions/mean_length": 3149.375,
      "completions/mean_terminated_length": 2038.6666259765625,
      "completions/min_length": 765.0,
      "completions/min_terminated_length": 765.0,
      "epoch": 0.9257142857142857,
      "frac_reward_zero_std": 0.3125,
      "grad_norm": 0.11690174788236618,
      "kl": 0.0006685256958007812,
      "learning_rate": 1.9733794420337212e-08,
      "loss": 0.0665,
      "num_tokens": 114660916.0,
      "reward": 0.34375,
      "reward_std": 0.34346264600753784,
      "rewards/format_reward/mean": 0.34375,
      "rewards/format_reward/std": 0.4774521291255951,
      "step": 405
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.45833333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3197.0,
      "completions/mean_length": 2787.822998046875,
      "completions/mean_terminated_length": 1400.199951171875,
      "completions/min_length": 250.0,
      "completions/min_terminated_length": 250.0,
      "epoch": 0.928,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.10178104043006897,
      "kl": 0.0007681846618652344,
      "learning_rate": 1.9539516087697516e-08,
      "loss": 0.03,
      "num_tokens": 114944339.0,
      "reward": 0.3854166865348816,
      "reward_std": 0.12757760286331177,
      "rewards/format_reward/mean": 0.3854166567325592,
      "rewards/format_reward/std": 0.4892484247684479,
      "step": 406
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3472.0,
      "completions/mean_length": 3340.135498046875,
      "completions/mean_terminated_length": 2023.2667236328125,
      "completions/min_length": 806.0,
      "completions/min_terminated_length": 806.0,
      "epoch": 0.9302857142857143,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.10888979583978653,
      "kl": 0.0007346471150716146,
      "learning_rate": 1.9346966049016424e-08,
      "loss": 0.064,
      "num_tokens": 115281648.0,
      "reward": 0.2083333432674408,
      "reward_std": 0.24511615931987762,
      "rewards/format_reward/mean": 0.2083333283662796,
      "rewards/format_reward/std": 0.40824830532073975,
      "step": 407
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.20833333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3355.0,
      "completions/mean_length": 3007.58349609375,
      "completions/mean_terminated_length": 1675.862060546875,
      "completions/min_length": 560.0,
      "completions/min_terminated_length": 560.0,
      "epoch": 0.9325714285714286,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.11224064230918884,
      "kl": 0.0006457964579264323,
      "learning_rate": 1.9156153688911168e-08,
      "loss": 0.0819,
      "num_tokens": 115586336.0,
      "reward": 0.3125,
      "reward_std": 0.243794247508049,
      "rewards/format_reward/mean": 0.3125,
      "rewards/format_reward/std": 0.4659455418586731,
      "step": 408
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.6666666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3566.0,
      "completions/mean_length": 2656.28125,
      "completions/mean_terminated_length": 1357.4749755859375,
      "completions/min_length": 503.0,
      "completions/min_terminated_length": 503.0,
      "epoch": 0.9348571428571428,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.13253681361675262,
      "kl": 0.0007356007893880209,
      "learning_rate": 1.8967088307307e-08,
      "loss": 0.0449,
      "num_tokens": 115858313.0,
      "reward": 0.4479166865348816,
      "reward_std": 0.2653941810131073,
      "rewards/format_reward/mean": 0.4479166567325592,
      "rewards/format_reward/std": 0.49989035725593567,
      "step": 409
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0416666666666665,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3412.0,
      "completions/mean_length": 2570.791748046875,
      "completions/mean_terminated_length": 1598.938720703125,
      "completions/min_length": 364.0,
      "completions/min_terminated_length": 364.0,
      "epoch": 0.9371428571428572,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.11055982112884521,
      "kl": 0.0006931622823079427,
      "learning_rate": 1.877977911898387e-08,
      "loss": 0.0235,
      "num_tokens": 116121279.0,
      "reward": 0.5520833730697632,
      "reward_std": 0.25187548995018005,
      "rewards/format_reward/mean": 0.5520833134651184,
      "rewards/format_reward/std": 0.4998903274536133,
      "step": 410
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.9166666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3325.0,
      "completions/mean_length": 2561.8125,
      "completions/mean_terminated_length": 1450.7391357421875,
      "completions/min_length": 327.0,
      "completions/min_terminated_length": 327.0,
      "epoch": 0.9394285714285714,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.12361273914575577,
      "kl": 0.0007197062174479166,
      "learning_rate": 1.8594235253127372e-08,
      "loss": 0.0484,
      "num_tokens": 116382873.0,
      "reward": 0.5416667461395264,
      "reward_std": 0.25731295347213745,
      "rewards/format_reward/mean": 0.5416666865348816,
      "rewards/format_reward/std": 0.5008764266967773,
      "step": 411
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5416666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3556.0,
      "completions/mean_length": 2813.83349609375,
      "completions/mean_terminated_length": 1585.729736328125,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 512.0,
      "epoch": 0.9417142857142857,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.11615951359272003,
      "kl": 0.000743865966796875,
      "learning_rate": 1.841046575288376e-08,
      "loss": 0.0666,
      "num_tokens": 116669063.0,
      "reward": 0.4270833432674408,
      "reward_std": 0.2653941810131073,
      "rewards/format_reward/mean": 0.4270833432674408,
      "rewards/format_reward/std": 0.4972511827945709,
      "step": 412
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.9166666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3359.0,
      "completions/mean_length": 2618.96875,
      "completions/mean_terminated_length": 1570.021728515625,
      "completions/min_length": 327.0,
      "completions/min_terminated_length": 327.0,
      "epoch": 0.944,
      "frac_reward_zero_std": 0.3125,
      "grad_norm": 0.1295614242553711,
      "kl": 0.0006841023763020834,
      "learning_rate": 1.822847957491922e-08,
      "loss": 0.0537,
      "num_tokens": 116936600.0,
      "reward": 0.5520833730697632,
      "reward_std": 0.3299438953399658,
      "rewards/format_reward/mean": 0.5520833134651184,
      "rewards/format_reward/std": 0.4998903274536133,
      "step": 413
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5416666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3509.0,
      "completions/mean_length": 2781.52099609375,
      "completions/mean_terminated_length": 1501.8919677734375,
      "completions/min_length": 530.0,
      "completions/min_terminated_length": 530.0,
      "epoch": 0.9462857142857143,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.10412473976612091,
      "kl": 0.0008223851521809896,
      "learning_rate": 1.804828558898332e-08,
      "loss": 0.0331,
      "num_tokens": 117219448.0,
      "reward": 0.3958333432674408,
      "reward_std": 0.1343369334936142,
      "rewards/format_reward/mean": 0.3958333432674408,
      "rewards/format_reward/std": 0.4915960133075714,
      "step": 414
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.875,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3551.0,
      "completions/mean_length": 2655.20849609375,
      "completions/mean_terminated_length": 1602.5777587890625,
      "completions/min_length": 511.0,
      "completions/min_terminated_length": 511.0,
      "epoch": 0.9485714285714286,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.12450996041297913,
      "kl": 0.0008185704549153646,
      "learning_rate": 1.786989257747672e-08,
      "loss": 0.0722,
      "num_tokens": 117490158.0,
      "reward": 0.4895833432674408,
      "reward_std": 0.2566770315170288,
      "rewards/format_reward/mean": 0.4895833432674408,
      "rewards/format_reward/std": 0.5025155544281006,
      "step": 415
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.75,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3497.0,
      "completions/mean_length": 2702.28125,
      "completions/mean_terminated_length": 1568.6429443359375,
      "completions/min_length": 492.0,
      "completions/min_terminated_length": 492.0,
      "epoch": 0.9508571428571428,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.12505842745304108,
      "kl": 0.0007251103719075521,
      "learning_rate": 1.7693309235023126e-08,
      "loss": 0.0789,
      "num_tokens": 117765849.0,
      "reward": 0.46875,
      "reward_std": 0.22440215945243835,
      "rewards/format_reward/mean": 0.46875,
      "rewards/format_reward/std": 0.5016420483589172,
      "step": 416
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.75,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3350.0,
      "completions/mean_length": 2607.354248046875,
      "completions/mean_terminated_length": 1351.666748046875,
      "completions/min_length": 363.0,
      "completions/min_terminated_length": 363.0,
      "epoch": 0.9531428571428572,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.13574348390102386,
      "kl": 0.0007998148600260416,
      "learning_rate": 1.7518544168045523e-08,
      "loss": 0.0835,
      "num_tokens": 118032127.0,
      "reward": 0.4479166865348816,
      "reward_std": 0.25187548995018005,
      "rewards/format_reward/mean": 0.4479166567325592,
      "rewards/format_reward/std": 0.49989035725593567,
      "step": 417
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.75,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3561.0,
      "completions/mean_length": 2550.36474609375,
      "completions/mean_terminated_length": 1221.40478515625,
      "completions/min_length": 417.0,
      "completions/min_terminated_length": 417.0,
      "epoch": 0.9554285714285714,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.08835748583078384,
      "kl": 0.0007673899332682291,
      "learning_rate": 1.7345605894346727e-08,
      "loss": 0.0245,
      "num_tokens": 118292700.0,
      "reward": 0.4583333432674408,
      "reward_std": 0.11077921837568283,
      "rewards/format_reward/mean": 0.4583333432674408,
      "rewards/format_reward/std": 0.5008764266967773,
      "step": 418
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.625,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3562.0,
      "completions/mean_length": 2624.25,
      "completions/mean_terminated_length": 1221.5384521484375,
      "completions/min_length": 337.0,
      "completions/min_terminated_length": 337.0,
      "epoch": 0.9577142857142857,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.15390397608280182,
      "kl": 0.00080108642578125,
      "learning_rate": 1.717450284269421e-08,
      "loss": 0.0616,
      "num_tokens": 118561356.0,
      "reward": 0.4375,
      "reward_std": 0.1988866627216339,
      "rewards/format_reward/mean": 0.4375,
      "rewards/format_reward/std": 0.4986824691295624,
      "step": 419
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.75,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3364.0,
      "completions/mean_length": 2631.77099609375,
      "completions/mean_terminated_length": 1407.4761962890625,
      "completions/min_length": 329.0,
      "completions/min_terminated_length": 329.0,
      "epoch": 0.96,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.11759354919195175,
      "kl": 0.0007991790771484375,
      "learning_rate": 1.7005243352409332e-08,
      "loss": 0.0723,
      "num_tokens": 118830308.0,
      "reward": 0.4479166865348816,
      "reward_std": 0.21436314284801483,
      "rewards/format_reward/mean": 0.4479166567325592,
      "rewards/format_reward/std": 0.49989035725593567,
      "step": 420
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5416666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3442.0,
      "completions/mean_length": 2744.541748046875,
      "completions/mean_terminated_length": 1405.9459228515625,
      "completions/min_length": 552.0,
      "completions/min_terminated_length": 552.0,
      "epoch": 0.9622857142857143,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.12916982173919678,
      "kl": 0.0007009506225585938,
      "learning_rate": 1.6837835672960833e-08,
      "loss": 0.1051,
      "num_tokens": 119109948.0,
      "reward": 0.40625,
      "reward_std": 0.30638620257377625,
      "rewards/format_reward/mean": 0.40625,
      "rewards/format_reward/std": 0.4937104284763336,
      "step": 421
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7916666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3468.0,
      "completions/mean_length": 2610.416748046875,
      "completions/mean_terminated_length": 1410.4185791015625,
      "completions/min_length": 243.0,
      "completions/min_terminated_length": 243.0,
      "epoch": 0.9645714285714285,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.09457219392061234,
      "kl": 0.0007375081380208334,
      "learning_rate": 1.6672287963562854e-08,
      "loss": 0.0157,
      "num_tokens": 119376136.0,
      "reward": 0.5,
      "reward_std": 0.18404607474803925,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.5026246905326843,
      "step": 422
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.8333333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3403.0,
      "completions/mean_length": 2588.291748046875,
      "completions/mean_terminated_length": 1411.5455322265625,
      "completions/min_length": 487.0,
      "completions/min_terminated_length": 487.0,
      "epoch": 0.9668571428571429,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.09154385328292847,
      "kl": 0.0007031758626302084,
      "learning_rate": 1.6508608292777203e-08,
      "loss": 0.0326,
      "num_tokens": 119640710.0,
      "reward": 0.4791666865348816,
      "reward_std": 0.16661179065704346,
      "rewards/format_reward/mean": 0.4791666567325592,
      "rewards/format_reward/std": 0.5021882057189941,
      "step": 423
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.16666666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3546.0,
      "completions/mean_length": 2996.447998046875,
      "completions/mean_terminated_length": 1569.5357666015625,
      "completions/min_length": 549.0,
      "completions/min_terminated_length": 549.0,
      "epoch": 0.9691428571428572,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.08817347139120102,
      "kl": 0.0007321039835611979,
      "learning_rate": 1.6346804638120097e-08,
      "loss": 0.0139,
      "num_tokens": 119945019.0,
      "reward": 0.3333333134651184,
      "reward_std": 0.1988866627216339,
      "rewards/format_reward/mean": 0.3333333432674408,
      "rewards/format_reward/std": 0.4738790988922119,
      "step": 424
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7083333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3490.0,
      "completions/mean_length": 2831.479248046875,
      "completions/mean_terminated_length": 1821.9998779296875,
      "completions/min_length": 417.0,
      "completions/min_terminated_length": 417.0,
      "epoch": 0.9714285714285714,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.11002353578805923,
      "kl": 0.0006011327107747396,
      "learning_rate": 1.6186884885673413e-08,
      "loss": 0.0259,
      "num_tokens": 120232825.0,
      "reward": 0.4479166865348816,
      "reward_std": 0.26930975914001465,
      "rewards/format_reward/mean": 0.4479166567325592,
      "rewards/format_reward/std": 0.49989035725593567,
      "step": 425
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.875,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3507.0,
      "completions/mean_length": 2616.33349609375,
      "completions/mean_terminated_length": 1519.64453125,
      "completions/min_length": 339.0,
      "completions/min_terminated_length": 339.0,
      "epoch": 0.9737142857142858,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.12342694401741028,
      "kl": 0.0007171630859375,
      "learning_rate": 1.602885682970026e-08,
      "loss": 0.06,
      "num_tokens": 120499863.0,
      "reward": 0.5416666865348816,
      "reward_std": 0.28219255805015564,
      "rewards/format_reward/mean": 0.5416666865348816,
      "rewards/format_reward/std": 0.5008764266967773,
      "step": 426
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7916666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3290.0,
      "completions/mean_length": 2501.697998046875,
      "completions/mean_terminated_length": 1167.6976318359375,
      "completions/min_length": 484.0,
      "completions/min_terminated_length": 484.0,
      "epoch": 0.976,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.09393564611673355,
      "kl": 0.000789642333984375,
      "learning_rate": 1.5872728172265146e-08,
      "loss": 0.0455,
      "num_tokens": 120756130.0,
      "reward": 0.46875,
      "reward_std": 0.16856960952281952,
      "rewards/format_reward/mean": 0.46875,
      "rewards/format_reward/std": 0.5016420483589172,
      "step": 427
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.875,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3578.0,
      "completions/mean_length": 2545.83349609375,
      "completions/mean_terminated_length": 1369.2445068359375,
      "completions/min_length": 286.0,
      "completions/min_terminated_length": 286.0,
      "epoch": 0.9782857142857143,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.09636654704809189,
      "kl": 0.0007244745890299479,
      "learning_rate": 1.571850652285857e-08,
      "loss": 0.0211,
      "num_tokens": 121016226.0,
      "reward": 0.5104166865348816,
      "reward_std": 0.1585305631160736,
      "rewards/format_reward/mean": 0.5104166865348816,
      "rewards/format_reward/std": 0.5025155544281006,
      "step": 428
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.875,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3367.0,
      "completions/mean_length": 2642.48974609375,
      "completions/mean_terminated_length": 1575.4444580078125,
      "completions/min_length": 343.0,
      "completions/min_terminated_length": 343.0,
      "epoch": 0.9805714285714285,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.11780349910259247,
      "kl": 0.0007855097452799479,
      "learning_rate": 1.5566199398026148e-08,
      "loss": 0.0286,
      "num_tokens": 121284857.0,
      "reward": 0.5,
      "reward_std": 0.2076037973165512,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.5026246905326843,
      "step": 429
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7916666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3425.0,
      "completions/mean_length": 2600.59375,
      "completions/mean_terminated_length": 1388.4884033203125,
      "completions/min_length": 497.0,
      "completions/min_terminated_length": 497.0,
      "epoch": 0.9828571428571429,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.14266683161258698,
      "kl": 0.000858306884765625,
      "learning_rate": 1.5415814221002266e-08,
      "loss": 0.0254,
      "num_tokens": 121550366.0,
      "reward": 0.46875,
      "reward_std": 0.22440217435359955,
      "rewards/format_reward/mean": 0.46875,
      "rewards/format_reward/std": 0.5016420483589172,
      "step": 430
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.20833333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3494.0,
      "completions/mean_length": 3056.322998046875,
      "completions/mean_terminated_length": 1837.2069091796875,
      "completions/min_length": 385.0,
      "completions/min_terminated_length": 385.0,
      "epoch": 0.9851428571428571,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.13045914471149445,
      "kl": 0.0007486343383789062,
      "learning_rate": 1.5267358321348288e-08,
      "loss": 0.0534,
      "num_tokens": 121859649.0,
      "reward": 0.3229166865348816,
      "reward_std": 0.28806596994400024,
      "rewards/format_reward/mean": 0.3229166567325592,
      "rewards/format_reward/std": 0.4700457453727722,
      "step": 431
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.375,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3398.0,
      "completions/mean_length": 2815.17724609375,
      "completions/mean_terminated_length": 1347.42431640625,
      "completions/min_length": 456.0,
      "completions/min_terminated_length": 456.0,
      "epoch": 0.9874285714285714,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.1318049281835556,
      "kl": 0.0009895960489908855,
      "learning_rate": 1.5120838934595337e-08,
      "loss": 0.0314,
      "num_tokens": 122145320.0,
      "reward": 0.3541666865348816,
      "reward_std": 0.17532894015312195,
      "rewards/format_reward/mean": 0.3541666567325592,
      "rewards/format_reward/std": 0.4807705879211426,
      "step": 432
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.33333333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3580.0,
      "completions/mean_length": 2879.77099609375,
      "completions/mean_terminated_length": 1471.3125,
      "completions/min_length": 451.0,
      "completions/min_terminated_length": 451.0,
      "epoch": 0.9897142857142858,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.12791548669338226,
      "kl": 0.0007193883260091146,
      "learning_rate": 1.4976263201891612e-08,
      "loss": 0.1038,
      "num_tokens": 122437462.0,
      "reward": 0.3854166865348816,
      "reward_std": 0.28806596994400024,
      "rewards/format_reward/mean": 0.3854166567325592,
      "rewards/format_reward/std": 0.4892484247684479,
      "step": 433
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.75,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3412.0,
      "completions/mean_length": 2634.229248046875,
      "completions/mean_terminated_length": 1413.09521484375,
      "completions/min_length": 305.0,
      "completions/min_terminated_length": 305.0,
      "epoch": 0.992,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.13095231354236603,
      "kl": 0.0008246103922526041,
      "learning_rate": 1.483363816965435e-08,
      "loss": 0.0711,
      "num_tokens": 122705498.0,
      "reward": 0.5,
      "reward_std": 0.29962682723999023,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.5026246905326843,
      "step": 434
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5833333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3562.0,
      "completions/mean_length": 2763.479248046875,
      "completions/mean_terminated_length": 1511.105224609375,
      "completions/min_length": 368.0,
      "completions/min_terminated_length": 368.0,
      "epoch": 0.9942857142857143,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.10234726965427399,
      "kl": 0.0006532669067382812,
      "learning_rate": 1.469297078922642e-08,
      "loss": 0.0564,
      "num_tokens": 122986410.0,
      "reward": 0.4375,
      "reward_std": 0.2076038122177124,
      "rewards/format_reward/mean": 0.4375,
      "rewards/format_reward/std": 0.4986824691295624,
      "step": 435
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.33333333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3428.0,
      "completions/mean_length": 2980.33349609375,
      "completions/mean_terminated_length": 1773.0,
      "completions/min_length": 547.0,
      "completions/min_terminated_length": 547.0,
      "epoch": 0.9965714285714286,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.11328182369470596,
      "kl": 0.0007276535034179688,
      "learning_rate": 1.4554267916537493e-08,
      "loss": 0.0574,
      "num_tokens": 123289448.0,
      "reward": 0.375,
      "reward_std": 0.3044283986091614,
      "rewards/format_reward/mean": 0.375,
      "rewards/format_reward/std": 0.4866642653942108,
      "step": 436
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.4583333333333335,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3196.0,
      "completions/mean_length": 2099.27099609375,
      "completions/mean_terminated_length": 1168.16943359375,
      "completions/min_length": 240.0,
      "completions/min_terminated_length": 240.0,
      "epoch": 0.9988571428571429,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.130823016166687,
      "kl": 0.0008134841918945312,
      "learning_rate": 1.4417536311769885e-08,
      "loss": 0.0887,
      "num_tokens": 123506164.0,
      "reward": 0.6145833730697632,
      "reward_std": 0.22831778228282928,
      "rewards/format_reward/mean": 0.6145833134651184,
      "rewards/format_reward/std": 0.4892484247684479,
      "step": 437
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3551.0,
      "completions/mean_length": 2794.90625,
      "completions/mean_terminated_length": 1479.75,
      "completions/min_length": 541.0,
      "completions/min_terminated_length": 541.0,
      "epoch": 1.0022857142857142,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.10344143956899643,
      "kl": 0.000720977783203125,
      "learning_rate": 1.4282782639029129e-08,
      "loss": 0.0621,
      "num_tokens": 123790399.0,
      "reward": 0.4583333432674408,
      "reward_std": 0.1988866627216339,
      "rewards/format_reward/mean": 0.4583333432674408,
      "rewards/format_reward/std": 0.5008764266967773,
      "step": 438
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.875,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3584.0,
      "completions/mean_length": 2578.5625,
      "completions/mean_terminated_length": 1439.066650390625,
      "completions/min_length": 348.0,
      "completions/min_terminated_length": 348.0,
      "epoch": 1.0045714285714287,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.10817626118659973,
      "kl": 0.00086212158203125,
      "learning_rate": 1.4150013466019115e-08,
      "loss": 0.0487,
      "num_tokens": 124053823.0,
      "reward": 0.5104166865348816,
      "reward_std": 0.23311930894851685,
      "rewards/format_reward/mean": 0.5104166865348816,
      "rewards/format_reward/std": 0.5025155544281006,
      "step": 439
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.6666666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3413.0,
      "completions/mean_length": 2653.84375,
      "completions/mean_terminated_length": 1351.625,
      "completions/min_length": 470.0,
      "completions/min_terminated_length": 470.0,
      "epoch": 1.006857142857143,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.14167174696922302,
      "kl": 0.0007429122924804688,
      "learning_rate": 1.4019235263722034e-08,
      "loss": 0.0554,
      "num_tokens": 124324456.0,
      "reward": 0.4791666865348816,
      "reward_std": 0.2996268570423126,
      "rewards/format_reward/mean": 0.4791666567325592,
      "rewards/format_reward/std": 0.5021882057189941,
      "step": 440
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.875,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3550.0,
      "completions/mean_length": 2666.854248046875,
      "completions/mean_terminated_length": 1627.4222412109375,
      "completions/min_length": 379.0,
      "completions/min_terminated_length": 379.0,
      "epoch": 1.0091428571428571,
      "frac_reward_zero_std": 0.3125,
      "grad_norm": 0.1454349160194397,
      "kl": 0.0007111231486002604,
      "learning_rate": 1.3890454406082957e-08,
      "loss": 0.0507,
      "num_tokens": 124595780.0,
      "reward": 0.5208333730697632,
      "reward_std": 0.34542036056518555,
      "rewards/format_reward/mean": 0.5208333134651184,
      "rewards/format_reward/std": 0.5021882057189941,
      "step": 441
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.08333333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3391.0,
      "completions/mean_length": 3058.17724609375,
      "completions/mean_terminated_length": 1642.5001220703125,
      "completions/min_length": 600.0,
      "completions/min_terminated_length": 600.0,
      "epoch": 1.0114285714285713,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.0833083987236023,
      "kl": 0.0007244745890299479,
      "learning_rate": 1.3763677169699217e-08,
      "loss": 0.0255,
      "num_tokens": 124905115.0,
      "reward": 0.3020833432674408,
      "reward_std": 0.17728674411773682,
      "rewards/format_reward/mean": 0.3020833432674408,
      "rewards/format_reward/std": 0.46157145500183105,
      "step": 442
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7083333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3451.0,
      "completions/mean_length": 2548.885498046875,
      "completions/mean_terminated_length": 1160.3170166015625,
      "completions/min_length": 468.0,
      "completions/min_terminated_length": 468.0,
      "epoch": 1.0137142857142858,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.09871827065944672,
      "kl": 0.000720977783203125,
      "learning_rate": 1.3638909733514453e-08,
      "loss": 0.0578,
      "num_tokens": 125165906.0,
      "reward": 0.5,
      "reward_std": 0.2076037973165512,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.5026246905326843,
      "step": 443
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.75,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3484.0,
      "completions/mean_length": 2776.33349609375,
      "completions/mean_terminated_length": 1737.90478515625,
      "completions/min_length": 494.0,
      "completions/min_terminated_length": 494.0,
      "epoch": 1.016,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.10498413443565369,
      "kl": 0.0008420944213867188,
      "learning_rate": 1.3516158178517482e-08,
      "loss": 0.0018,
      "num_tokens": 125448592.0,
      "reward": 0.5104166865348816,
      "reward_std": 0.18208830058574677,
      "rewards/format_reward/mean": 0.5104166865348816,
      "rewards/format_reward/std": 0.5025155544281006,
      "step": 444
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.8333333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3555.0,
      "completions/mean_length": 2626.33349609375,
      "completions/mean_terminated_length": 1494.5455322265625,
      "completions/min_length": 371.0,
      "completions/min_terminated_length": 371.0,
      "epoch": 1.0182857142857142,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.09597191214561462,
      "kl": 0.0006589889526367188,
      "learning_rate": 1.3395428487445914e-08,
      "loss": 0.0328,
      "num_tokens": 125716848.0,
      "reward": 0.4583333432674408,
      "reward_std": 0.1613743156194687,
      "rewards/format_reward/mean": 0.4583333432674408,
      "rewards/format_reward/std": 0.5008764266967773,
      "step": 445
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.25,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3397.0,
      "completions/mean_length": 2971.6875,
      "completions/mean_terminated_length": 1624.60009765625,
      "completions/min_length": 462.0,
      "completions/min_terminated_length": 462.0,
      "epoch": 1.0205714285714285,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.10853156447410583,
      "kl": 0.0006860097249348959,
      "learning_rate": 1.327672654449457e-08,
      "loss": 0.0547,
      "num_tokens": 126017958.0,
      "reward": 0.3854166865348816,
      "reward_std": 0.2653941810131073,
      "rewards/format_reward/mean": 0.3854166567325592,
      "rewards/format_reward/std": 0.4892484247684479,
      "step": 446
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.125,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3578.0,
      "completions/mean_length": 2404.8125,
      "completions/mean_terminated_length": 1364.35302734375,
      "completions/min_length": 388.0,
      "completions/min_terminated_length": 388.0,
      "epoch": 1.022857142857143,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.10824251919984818,
      "kl": 0.0008029937744140625,
      "learning_rate": 1.316005813502869e-08,
      "loss": 0.0302,
      "num_tokens": 126264390.0,
      "reward": 0.53125,
      "reward_std": 0.14109627902507782,
      "rewards/format_reward/mean": 0.53125,
      "rewards/format_reward/std": 0.5016420483589172,
      "step": 447
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.125,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3539.0,
      "completions/mean_length": 2996.34375,
      "completions/mean_terminated_length": 1494.5555419921875,
      "completions/min_length": 463.0,
      "completions/min_terminated_length": 463.0,
      "epoch": 1.0251428571428571,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.11421850323677063,
      "kl": 0.0007171630859375,
      "learning_rate": 1.3045428945301954e-08,
      "loss": 0.0505,
      "num_tokens": 126568275.0,
      "reward": 0.3020833432674408,
      "reward_std": 0.25187548995018005,
      "rewards/format_reward/mean": 0.3020833432674408,
      "rewards/format_reward/std": 0.46157145500183105,
      "step": 448
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.3333333333333335,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3491.0,
      "completions/mean_length": 2175.30224609375,
      "completions/mean_terminated_length": 1169.08935546875,
      "completions/min_length": 309.0,
      "completions/min_terminated_length": 309.0,
      "epoch": 1.0274285714285714,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.11491094529628754,
      "kl": 0.0010004043579101562,
      "learning_rate": 1.2932844562179351e-08,
      "loss": 0.0465,
      "num_tokens": 126793280.0,
      "reward": 0.6041666865348816,
      "reward_std": 0.21764282882213593,
      "rewards/format_reward/mean": 0.6041666865348816,
      "rewards/format_reward/std": 0.4915960133075714,
      "step": 449
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7083333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3553.0,
      "completions/mean_length": 2659.11474609375,
      "completions/mean_terminated_length": 1418.41455078125,
      "completions/min_length": 464.0,
      "completions/min_terminated_length": 464.0,
      "epoch": 1.0297142857142858,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.10888926684856415,
      "kl": 0.0007184346516927084,
      "learning_rate": 1.2822310472864883e-08,
      "loss": 0.0329,
      "num_tokens": 127064491.0,
      "reward": 0.46875,
      "reward_std": 0.20084445178508759,
      "rewards/format_reward/mean": 0.46875,
      "rewards/format_reward/std": 0.5016420483589172,
      "step": 450
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.25,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 2865.0,
      "completions/mean_length": 2820.33349609375,
      "completions/mean_terminated_length": 1140.2667236328125,
      "completions/min_length": 360.0,
      "completions/min_terminated_length": 360.0,
      "epoch": 1.032,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.09330493211746216,
      "kl": 0.0008147557576497396,
      "learning_rate": 1.2713832064634124e-08,
      "loss": 0.0562,
      "num_tokens": 127351083.0,
      "reward": 0.3333333432674408,
      "reward_std": 0.17532894015312195,
      "rewards/format_reward/mean": 0.3333333432674408,
      "rewards/format_reward/std": 0.4738790988922119,
      "step": 451
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0416666666666665,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3511.0,
      "completions/mean_length": 2383.77099609375,
      "completions/mean_terminated_length": 1232.5306396484375,
      "completions/min_length": 284.0,
      "completions/min_terminated_length": 284.0,
      "epoch": 1.0342857142857143,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.12870053946971893,
      "kl": 0.00078582763671875,
      "learning_rate": 1.260741462457165e-08,
      "loss": 0.0507,
      "num_tokens": 127595621.0,
      "reward": 0.5729166865348816,
      "reward_std": 0.2741113305091858,
      "rewards/format_reward/mean": 0.5729166865348816,
      "rewards/format_reward/std": 0.4972512125968933,
      "step": 452
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.375,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3564.0,
      "completions/mean_length": 2839.89599609375,
      "completions/mean_terminated_length": 1419.3333740234375,
      "completions/min_length": 468.0,
      "completions/min_terminated_length": 468.0,
      "epoch": 1.0365714285714285,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.11888577044010162,
      "kl": 0.000759124755859375,
      "learning_rate": 1.2503063339313355e-08,
      "loss": 0.0509,
      "num_tokens": 127884727.0,
      "reward": 0.3958333432674408,
      "reward_std": 0.2712675929069519,
      "rewards/format_reward/mean": 0.3958333432674408,
      "rewards/format_reward/std": 0.4915960133075714,
      "step": 453
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.9583333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3508.0,
      "completions/mean_length": 2449.48974609375,
      "completions/mean_terminated_length": 1266.7021484375,
      "completions/min_length": 391.0,
      "completions/min_terminated_length": 391.0,
      "epoch": 1.038857142857143,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.11443246155977249,
      "kl": 0.0007734298706054688,
      "learning_rate": 1.2400783294793667e-08,
      "loss": 0.0354,
      "num_tokens": 128135298.0,
      "reward": 0.5,
      "reward_std": 0.19408512115478516,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.5026246905326843,
      "step": 454
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.20833333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3454.0,
      "completions/mean_length": 2976.09375,
      "completions/mean_terminated_length": 1571.6207275390625,
      "completions/min_length": 451.0,
      "completions/min_terminated_length": 451.0,
      "epoch": 1.0411428571428571,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.099580317735672,
      "kl": 0.0007212956746419271,
      "learning_rate": 1.2300579475997656e-08,
      "loss": 0.0599,
      "num_tokens": 128437083.0,
      "reward": 0.3125,
      "reward_std": 0.19408512115478516,
      "rewards/format_reward/mean": 0.3125,
      "rewards/format_reward/std": 0.4659455418586731,
      "step": 455
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.1666666666666665,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3349.0,
      "completions/mean_length": 2190.25,
      "completions/mean_terminated_length": 1010.923095703125,
      "completions/min_length": 296.0,
      "completions/min_terminated_length": 296.0,
      "epoch": 1.0434285714285714,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.05977644771337509,
      "kl": 0.000743865966796875,
      "learning_rate": 1.2202456766718091e-08,
      "loss": 0.0024,
      "num_tokens": 128662839.0,
      "reward": 0.5729166865348816,
      "reward_std": 0.06650752574205399,
      "rewards/format_reward/mean": 0.5729166865348816,
      "rewards/format_reward/std": 0.4972512423992157,
      "step": 456
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.9166666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3560.0,
      "completions/mean_length": 2611.46875,
      "completions/mean_terminated_length": 1554.36962890625,
      "completions/min_length": 439.0,
      "completions/min_terminated_length": 439.0,
      "epoch": 1.0457142857142858,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.09623143821954727,
      "kl": 0.0007073084513346354,
      "learning_rate": 1.2106419949317387e-08,
      "loss": -0.007,
      "num_tokens": 128929506.0,
      "reward": 0.5208333730697632,
      "reward_std": 0.1430540680885315,
      "rewards/format_reward/mean": 0.5208333134651184,
      "rewards/format_reward/std": 0.5021882057189941,
      "step": 457
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3443.0,
      "completions/mean_length": 2269.17724609375,
      "completions/mean_terminated_length": 954.3541870117188,
      "completions/min_length": 288.0,
      "completions/min_terminated_length": 288.0,
      "epoch": 1.048,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.10378686338663101,
      "kl": 0.0007356007893880209,
      "learning_rate": 1.2012473704494538e-08,
      "loss": 0.0184,
      "num_tokens": 129162539.0,
      "reward": 0.5208333730697632,
      "reward_std": 0.11558075994253159,
      "rewards/format_reward/mean": 0.5208333134651184,
      "rewards/format_reward/std": 0.5021882057189941,
      "step": 458
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 2979.0,
      "completions/mean_length": 3132.71875,
      "completions/mean_terminated_length": 1521.0,
      "completions/min_length": 517.0,
      "completions/min_terminated_length": 517.0,
      "epoch": 1.0502857142857143,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.13919900357723236,
      "kl": 0.00072479248046875,
      "learning_rate": 1.1920622611056975e-08,
      "loss": 0.0613,
      "num_tokens": 129479390.0,
      "reward": 0.28125,
      "reward_std": 0.2841503620147705,
      "rewards/format_reward/mean": 0.28125,
      "rewards/format_reward/std": 0.45196935534477234,
      "step": 459
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0416666666666665,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3358.0,
      "completions/mean_length": 2476.572998046875,
      "completions/mean_terminated_length": 1414.346923828125,
      "completions/min_length": 343.0,
      "completions/min_terminated_length": 343.0,
      "epoch": 1.0525714285714285,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.11369922757148743,
      "kl": 0.0008138020833333334,
      "learning_rate": 1.1830871145697411e-08,
      "loss": 0.0575,
      "num_tokens": 129733245.0,
      "reward": 0.5104166865348816,
      "reward_std": 0.19604292511940002,
      "rewards/format_reward/mean": 0.5104166865348816,
      "rewards/format_reward/std": 0.5025155544281006,
      "step": 460
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3547.0,
      "completions/mean_length": 2336.166748046875,
      "completions/mean_terminated_length": 1088.3333740234375,
      "completions/min_length": 420.0,
      "completions/min_terminated_length": 420.0,
      "epoch": 1.054857142857143,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.09804533421993256,
      "kl": 0.0007867813110351562,
      "learning_rate": 1.174322368277565e-08,
      "loss": 0.0555,
      "num_tokens": 129973975.0,
      "reward": 0.53125,
      "reward_std": 0.14981341361999512,
      "rewards/format_reward/mean": 0.53125,
      "rewards/format_reward/std": 0.5016420483589172,
      "step": 461
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3211.0,
      "completions/mean_length": 2410.416748046875,
      "completions/mean_terminated_length": 1236.8333740234375,
      "completions/min_length": 291.0,
      "completions/min_terminated_length": 291.0,
      "epoch": 1.0571428571428572,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.13963499665260315,
      "kl": 0.0008455912272135416,
      "learning_rate": 1.1657684494105385e-08,
      "loss": 0.0665,
      "num_tokens": 130221215.0,
      "reward": 0.53125,
      "reward_std": 0.20956161618232727,
      "rewards/format_reward/mean": 0.53125,
      "rewards/format_reward/std": 0.5016420483589172,
      "step": 462
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.20833333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3518.0,
      "completions/mean_length": 2983.822998046875,
      "completions/mean_terminated_length": 1597.2069091796875,
      "completions/min_length": 452.0,
      "completions/min_terminated_length": 452.0,
      "epoch": 1.0594285714285714,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.0957186296582222,
      "kl": 0.0007654825846354166,
      "learning_rate": 1.1574257748745986e-08,
      "loss": 0.0327,
      "num_tokens": 130525020.0,
      "reward": 0.3333333432674408,
      "reward_std": 0.1613743007183075,
      "rewards/format_reward/mean": 0.3333333432674408,
      "rewards/format_reward/std": 0.4738790988922119,
      "step": 463
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7083333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3175.0,
      "completions/mean_length": 2496.75,
      "completions/mean_terminated_length": 1038.243896484375,
      "completions/min_length": 293.0,
      "completions/min_terminated_length": 293.0,
      "epoch": 1.0617142857142856,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.09010104835033417,
      "kl": 0.0008306503295898438,
      "learning_rate": 1.1492947512799328e-08,
      "loss": 0.0234,
      "num_tokens": 130781148.0,
      "reward": 0.447916716337204,
      "reward_std": 0.14109627902507782,
      "rewards/format_reward/mean": 0.4479166567325592,
      "rewards/format_reward/std": 0.49989035725593567,
      "step": 464
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7083333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3473.0,
      "completions/mean_length": 2685.0,
      "completions/mean_terminated_length": 1479.0242919921875,
      "completions/min_length": 379.0,
      "completions/min_terminated_length": 379.0,
      "epoch": 1.064,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.10904668271541595,
      "kl": 0.0007090568542480469,
      "learning_rate": 1.1413757749211601e-08,
      "loss": 0.0332,
      "num_tokens": 131054898.0,
      "reward": 0.5104166865348816,
      "reward_std": 0.23703491687774658,
      "rewards/format_reward/mean": 0.5104166865348816,
      "rewards/format_reward/std": 0.5025155544281006,
      "step": 465
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.25,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3584.0,
      "completions/mean_length": 2529.375,
      "completions/mean_terminated_length": 1709.111083984375,
      "completions/min_length": 545.0,
      "completions/min_terminated_length": 545.0,
      "epoch": 1.0662857142857143,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.1257898211479187,
      "kl": 0.0007638931274414062,
      "learning_rate": 1.133669231758016e-08,
      "loss": 0.058,
      "num_tokens": 131312892.0,
      "reward": 0.6354166865348816,
      "reward_std": 0.3002627491950989,
      "rewards/format_reward/mean": 0.6354166865348816,
      "rewards/format_reward/std": 0.4838397204875946,
      "step": 466
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.75,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3306.0,
      "completions/mean_length": 2662.25,
      "completions/mean_terminated_length": 1477.1429443359375,
      "completions/min_length": 456.0,
      "completions/min_terminated_length": 456.0,
      "epoch": 1.0685714285714285,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.13364644348621368,
      "kl": 0.0008055369059244791,
      "learning_rate": 1.1261754973965421e-08,
      "loss": 0.1014,
      "num_tokens": 131584518.0,
      "reward": 0.4583333432674408,
      "reward_std": 0.25863486528396606,
      "rewards/format_reward/mean": 0.4583333432674408,
      "rewards/format_reward/std": 0.5008764266967773,
      "step": 467
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.04166666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 2952.0,
      "completions/mean_length": 2969.385498046875,
      "completions/mean_terminated_length": 1223.8800048828125,
      "completions/min_length": 290.0,
      "completions/min_terminated_length": 290.0,
      "epoch": 1.070857142857143,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1145143136382103,
      "kl": 0.0007276535034179688,
      "learning_rate": 1.1188949370707786e-08,
      "loss": 0.0772,
      "num_tokens": 131885917.0,
      "reward": 0.2916666865348816,
      "reward_std": 0.23987865447998047,
      "rewards/format_reward/mean": 0.2916666567325592,
      "rewards/format_reward/std": 0.4569156765937805,
      "step": 468
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.75,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3372.0,
      "completions/mean_length": 2589.39599609375,
      "completions/mean_terminated_length": 1310.6190185546875,
      "completions/min_length": 425.0,
      "completions/min_terminated_length": 425.0,
      "epoch": 1.0731428571428572,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.11043540388345718,
      "kl": 0.000732421875,
      "learning_rate": 1.1118279056249653e-08,
      "loss": 0.0381,
      "num_tokens": 132150159.0,
      "reward": 0.5,
      "reward_std": 0.2076038122177124,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.5026246905326843,
      "step": 469
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.375,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3374.0,
      "completions/mean_length": 2956.89599609375,
      "completions/mean_terminated_length": 1759.697021484375,
      "completions/min_length": 392.0,
      "completions/min_terminated_length": 392.0,
      "epoch": 1.0754285714285714,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.09861429780721664,
      "kl": 0.0007677078247070312,
      "learning_rate": 1.1049747474962444e-08,
      "loss": 0.0378,
      "num_tokens": 132450659.0,
      "reward": 0.3645833432674408,
      "reward_std": 0.19080542027950287,
      "rewards/format_reward/mean": 0.3645833432674408,
      "rewards/format_reward/std": 0.4838397204875946,
      "step": 470
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5833333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3477.0,
      "completions/mean_length": 2792.86474609375,
      "completions/mean_terminated_length": 1585.3421630859375,
      "completions/min_length": 373.0,
      "completions/min_terminated_length": 373.0,
      "epoch": 1.0777142857142856,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.1034030020236969,
      "kl": 0.0007429122924804688,
      "learning_rate": 1.0983357966978745e-08,
      "loss": 0.0572,
      "num_tokens": 132734950.0,
      "reward": 0.4270833432674408,
      "reward_std": 0.2741113305091858,
      "rewards/format_reward/mean": 0.4270833432674408,
      "rewards/format_reward/std": 0.4972511827945709,
      "step": 471
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.875,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3517.0,
      "completions/mean_length": 2577.30224609375,
      "completions/mean_terminated_length": 1436.3778076171875,
      "completions/min_length": 520.0,
      "completions/min_terminated_length": 520.0,
      "epoch": 1.08,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.12331518530845642,
      "kl": 0.0008172988891601562,
      "learning_rate": 1.0919113768029517e-08,
      "loss": 0.0361,
      "num_tokens": 132998463.0,
      "reward": 0.5,
      "reward_std": 0.21764282882213593,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.5026246905326843,
      "step": 472
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.1666666666666665,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 2995.0,
      "completions/mean_length": 2257.385498046875,
      "completions/mean_terminated_length": 1134.865478515625,
      "completions/min_length": 302.0,
      "completions/min_terminated_length": 302.0,
      "epoch": 1.0822857142857143,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.14478808641433716,
      "kl": 0.0008268356323242188,
      "learning_rate": 1.0857018009286381e-08,
      "loss": 0.0633,
      "num_tokens": 133231096.0,
      "reward": 0.5729166865348816,
      "reward_std": 0.2653941810131073,
      "rewards/format_reward/mean": 0.5729166865348816,
      "rewards/format_reward/std": 0.4972512125968933,
      "step": 473
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.375,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3297.0,
      "completions/mean_length": 2741.822998046875,
      "completions/mean_terminated_length": 1134.0303955078125,
      "completions/min_length": 330.0,
      "completions/min_terminated_length": 330.0,
      "epoch": 1.0845714285714285,
      "frac_reward_zero_std": 0.3125,
      "grad_norm": 0.14106322824954987,
      "kl": 0.0009167989095052084,
      "learning_rate": 1.0797073717209012e-08,
      "loss": 0.0717,
      "num_tokens": 133510343.0,
      "reward": 0.3854166865348816,
      "reward_std": 0.2941893935203552,
      "rewards/format_reward/mean": 0.3854166567325592,
      "rewards/format_reward/std": 0.4892484247684479,
      "step": 474
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3558.0,
      "completions/mean_length": 2573.041748046875,
      "completions/mean_terminated_length": 1562.0833740234375,
      "completions/min_length": 457.0,
      "completions/min_terminated_length": 457.0,
      "epoch": 1.0868571428571427,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.13431628048419952,
      "kl": 0.0008484522501627604,
      "learning_rate": 1.0739283813397638e-08,
      "loss": 0.0011,
      "num_tokens": 133773627.0,
      "reward": 0.5208333730697632,
      "reward_std": 0.19408512115478516,
      "rewards/format_reward/mean": 0.5208333134651184,
      "rewards/format_reward/std": 0.5021882057189941,
      "step": 475
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3459.0,
      "completions/mean_length": 2703.70849609375,
      "completions/mean_terminated_length": 1236.5555419921875,
      "completions/min_length": 302.0,
      "completions/min_terminated_length": 302.0,
      "epoch": 1.0891428571428572,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.09678907692432404,
      "kl": 0.00064849853515625,
      "learning_rate": 1.068365111445064e-08,
      "loss": 0.0249,
      "num_tokens": 134049515.0,
      "reward": 0.40625,
      "reward_std": 0.12234009802341461,
      "rewards/format_reward/mean": 0.40625,
      "rewards/format_reward/std": 0.4937104284763336,
      "step": 476
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3427.0,
      "completions/mean_length": 2804.916748046875,
      "completions/mean_terminated_length": 1506.4444580078125,
      "completions/min_length": 366.0,
      "completions/min_terminated_length": 366.0,
      "epoch": 1.0914285714285714,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.08967314660549164,
      "kl": 0.0007128715515136719,
      "learning_rate": 1.063017833182728e-08,
      "loss": 0.0376,
      "num_tokens": 134334663.0,
      "reward": 0.3958333432674408,
      "reward_std": 0.2163209319114685,
      "rewards/format_reward/mean": 0.3958333432674408,
      "rewards/format_reward/std": 0.4915960133075714,
      "step": 477
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5416666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3474.0,
      "completions/mean_length": 2728.822998046875,
      "completions/mean_terminated_length": 1365.1622314453125,
      "completions/min_length": 455.0,
      "completions/min_terminated_length": 455.0,
      "epoch": 1.0937142857142856,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.11985599249601364,
      "kl": 0.0007877349853515625,
      "learning_rate": 1.0578868071715544e-08,
      "loss": 0.0121,
      "num_tokens": 134612590.0,
      "reward": 0.3958333432674408,
      "reward_std": 0.20280227065086365,
      "rewards/format_reward/mean": 0.3958333432674408,
      "rewards/format_reward/std": 0.4915960133075714,
      "step": 478
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5416666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3489.0,
      "completions/mean_length": 2677.479248046875,
      "completions/mean_terminated_length": 1231.9459228515625,
      "completions/min_length": 223.0,
      "completions/min_terminated_length": 223.0,
      "epoch": 1.096,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.1189078688621521,
      "kl": 0.0007257461547851562,
      "learning_rate": 1.0529722834905124e-08,
      "loss": 0.0703,
      "num_tokens": 134885042.0,
      "reward": 0.4479166865348816,
      "reward_std": 0.20956161618232727,
      "rewards/format_reward/mean": 0.4479166567325592,
      "rewards/format_reward/std": 0.49989035725593567,
      "step": 479
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3549.0,
      "completions/mean_length": 2686.36474609375,
      "completions/mean_terminated_length": 1190.3055419921875,
      "completions/min_length": 410.0,
      "completions/min_terminated_length": 410.0,
      "epoch": 1.0982857142857143,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0946437418460846,
      "kl": 0.0006910959879557291,
      "learning_rate": 1.0482745016665526e-08,
      "loss": 0.0257,
      "num_tokens": 135160339.0,
      "reward": 0.3854166865348816,
      "reward_std": 0.12234010547399521,
      "rewards/format_reward/mean": 0.3854166567325592,
      "rewards/format_reward/std": 0.4892484247684479,
      "step": 480
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.9583333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3202.0,
      "completions/mean_length": 2500.46875,
      "completions/mean_terminated_length": 1370.8297119140625,
      "completions/min_length": 466.0,
      "completions/min_terminated_length": 466.0,
      "epoch": 1.1005714285714285,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.09965284168720245,
      "kl": 0.0007975896199544271,
      "learning_rate": 1.0437936906629333e-08,
      "loss": 0.0608,
      "num_tokens": 135415948.0,
      "reward": 0.5208333730697632,
      "reward_std": 0.21240532398223877,
      "rewards/format_reward/mean": 0.5208333134651184,
      "rewards/format_reward/std": 0.5021882057189941,
      "step": 481
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5833333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3085.0,
      "completions/mean_length": 2721.40625,
      "completions/mean_terminated_length": 1404.8157958984375,
      "completions/min_length": 531.0,
      "completions/min_terminated_length": 531.0,
      "epoch": 1.1028571428571428,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.12250243872404099,
      "kl": 0.0008351008097330729,
      "learning_rate": 1.0395300688680625e-08,
      "loss": 0.0288,
      "num_tokens": 135693367.0,
      "reward": 0.4270833432674408,
      "reward_std": 0.23311930894851685,
      "rewards/format_reward/mean": 0.4270833432674408,
      "rewards/format_reward/std": 0.4972511827945709,
      "step": 482
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.375,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3527.0,
      "completions/mean_length": 2201.854248046875,
      "completions/mean_terminated_length": 1256.1754150390625,
      "completions/min_length": 396.0,
      "completions/min_terminated_length": 396.0,
      "epoch": 1.1051428571428572,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.09815461933612823,
      "kl": 0.0008169809977213541,
      "learning_rate": 1.0354838440848502e-08,
      "loss": 0.0174,
      "num_tokens": 135919343.0,
      "reward": 0.6458333730697632,
      "reward_std": 0.11077921092510223,
      "rewards/format_reward/mean": 0.6458333134651184,
      "rewards/format_reward/std": 0.4807705879211426,
      "step": 483
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0416666666666665,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3390.0,
      "completions/mean_length": 2413.447998046875,
      "completions/mean_terminated_length": 1290.6734619140625,
      "completions/min_length": 317.0,
      "completions/min_terminated_length": 317.0,
      "epoch": 1.1074285714285714,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.10431455820798874,
      "kl": 0.0007483164469401041,
      "learning_rate": 1.0316552135205838e-08,
      "loss": 0.0223,
      "num_tokens": 136167132.0,
      "reward": 0.5729166865348816,
      "reward_std": 0.2379208505153656,
      "rewards/format_reward/mean": 0.5729166865348816,
      "rewards/format_reward/std": 0.4972511827945709,
      "step": 484
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.625,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3554.0,
      "completions/mean_length": 2874.67724609375,
      "completions/mean_terminated_length": 1837.974365234375,
      "completions/min_length": 591.0,
      "completions/min_terminated_length": 591.0,
      "epoch": 1.1097142857142857,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.09176606684923172,
      "kl": 0.0006389617919921875,
      "learning_rate": 1.0280443637773164e-08,
      "loss": 0.0483,
      "num_tokens": 136460291.0,
      "reward": 0.4479166865348816,
      "reward_std": 0.18208828568458557,
      "rewards/format_reward/mean": 0.4479166567325592,
      "rewards/format_reward/std": 0.49989035725593567,
      "step": 485
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.2083333333333335,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3410.0,
      "completions/mean_length": 2256.104248046875,
      "completions/mean_terminated_length": 1178.7547607421875,
      "completions/min_length": 361.0,
      "completions/min_terminated_length": 361.0,
      "epoch": 1.112,
      "frac_reward_zero_std": 0.3125,
      "grad_norm": 0.16725574433803558,
      "kl": 0.0009609858194986979,
      "learning_rate": 1.0246514708427701e-08,
      "loss": 0.0685,
      "num_tokens": 136692033.0,
      "reward": 0.5937500596046448,
      "reward_std": 0.32122674584388733,
      "rewards/format_reward/mean": 0.59375,
      "rewards/format_reward/std": 0.4937104284763336,
      "step": 486
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.29166666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3506.0,
      "completions/mean_length": 3046.80224609375,
      "completions/mean_terminated_length": 1920.4193115234375,
      "completions/min_length": 607.0,
      "completions/min_terminated_length": 607.0,
      "epoch": 1.1142857142857143,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.14524559676647186,
      "kl": 0.000804901123046875,
      "learning_rate": 1.0214767000817596e-08,
      "loss": 0.0883,
      "num_tokens": 137000780.0,
      "reward": 0.3333333432674408,
      "reward_std": 0.3419407308101654,
      "rewards/format_reward/mean": 0.3333333432674408,
      "rewards/format_reward/std": 0.4738790988922119,
      "step": 487
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.0833333333333335,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3573.0,
      "completions/mean_length": 2507.23974609375,
      "completions/mean_terminated_length": 1516.6199951171875,
      "completions/min_length": 374.0,
      "completions/min_terminated_length": 374.0,
      "epoch": 1.1165714285714285,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.14165270328521729,
      "kl": 0.0008672078450520834,
      "learning_rate": 1.0185202062281335e-08,
      "loss": 0.0481,
      "num_tokens": 137256997.0,
      "reward": 0.5729166865348816,
      "reward_std": 0.3103017807006836,
      "rewards/format_reward/mean": 0.5729166865348816,
      "rewards/format_reward/std": 0.4972511827945709,
      "step": 488
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.7083333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3275.0,
      "completions/mean_length": 2566.15625,
      "completions/mean_terminated_length": 1200.756103515625,
      "completions/min_length": 310.0,
      "completions/min_terminated_length": 310.0,
      "epoch": 1.1188571428571428,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.10389388352632523,
      "kl": 0.0008487701416015625,
      "learning_rate": 1.0157821333772304e-08,
      "loss": 0.031,
      "num_tokens": 137518762.0,
      "reward": 0.4375,
      "reward_std": 0.10206207633018494,
      "rewards/format_reward/mean": 0.4375,
      "rewards/format_reward/std": 0.4986824691295624,
      "step": 489
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.45833333333333326,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3457.0,
      "completions/mean_length": 2818.52099609375,
      "completions/mean_terminated_length": 1484.4000244140625,
      "completions/min_length": 558.0,
      "completions/min_terminated_length": 558.0,
      "epoch": 1.1211428571428572,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.10865319520235062,
      "kl": 0.0007346471150716146,
      "learning_rate": 1.0132626149788589e-08,
      "loss": 0.0265,
      "num_tokens": 137806122.0,
      "reward": 0.3854166865348816,
      "reward_std": 0.21436314284801483,
      "rewards/format_reward/mean": 0.3854166567325592,
      "rewards/format_reward/std": 0.4892484247684479,
      "step": 490
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.1666666666666665,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3546.0,
      "completions/mean_length": 2398.77099609375,
      "completions/mean_terminated_length": 1395.8846435546875,
      "completions/min_length": 289.0,
      "completions/min_terminated_length": 289.0,
      "epoch": 1.1234285714285714,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1486992985010147,
      "kl": 0.0010083516438802083,
      "learning_rate": 1.0109617738307912e-08,
      "loss": 0.025,
      "num_tokens": 138051566.0,
      "reward": 0.5729166865348816,
      "reward_std": 0.22440217435359955,
      "rewards/format_reward/mean": 0.5729166865348816,
      "rewards/format_reward/std": 0.4972512125968933,
      "step": 491
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.2916666666666665,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3512.0,
      "completions/mean_length": 2436.635498046875,
      "completions/mean_terminated_length": 1581.3272705078125,
      "completions/min_length": 372.0,
      "completions/min_terminated_length": 372.0,
      "epoch": 1.1257142857142857,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.119545117020607,
      "kl": 0.0006758371988932291,
      "learning_rate": 1.008879722072778e-08,
      "loss": 0.0611,
      "num_tokens": 138301035.0,
      "reward": 0.625,
      "reward_std": 0.25863486528396606,
      "rewards/format_reward/mean": 0.625,
      "rewards/format_reward/std": 0.4866642653942108,
      "step": 492
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5416666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3578.0,
      "completions/mean_length": 2884.760498046875,
      "completions/mean_terminated_length": 1769.7568359375,
      "completions/min_length": 364.0,
      "completions/min_terminated_length": 364.0,
      "epoch": 1.1280000000000001,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.14157073199748993,
      "kl": 0.0008312861124674479,
      "learning_rate": 1.0070165611810855e-08,
      "loss": 0.0592,
      "num_tokens": 138594076.0,
      "reward": 0.40625,
      "reward_std": 0.2653941810131073,
      "rewards/format_reward/mean": 0.40625,
      "rewards/format_reward/std": 0.4937104284763336,
      "step": 493
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.9166666666666667,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3542.0,
      "completions/mean_length": 2582.59375,
      "completions/mean_terminated_length": 1494.1087646484375,
      "completions/min_length": 314.0,
      "completions/min_terminated_length": 314.0,
      "epoch": 1.1302857142857143,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.12795785069465637,
      "kl": 0.0007610321044921875,
      "learning_rate": 1.0053723819635471e-08,
      "loss": -0.0032,
      "num_tokens": 138857653.0,
      "reward": 0.4791666865348816,
      "reward_std": 0.1988866627216339,
      "rewards/format_reward/mean": 0.4791666567325592,
      "rewards/format_reward/std": 0.5021882057189941,
      "step": 494
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.8333333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3286.0,
      "completions/mean_length": 2581.46875,
      "completions/mean_terminated_length": 1396.6591796875,
      "completions/min_length": 353.0,
      "completions/min_terminated_length": 353.0,
      "epoch": 1.1325714285714286,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.12635618448257446,
      "kl": 0.0006399154663085938,
      "learning_rate": 1.0039472645551372e-08,
      "loss": 0.027,
      "num_tokens": 139121680.0,
      "reward": 0.46875,
      "reward_std": 0.25187548995018005,
      "rewards/format_reward/mean": 0.46875,
      "rewards/format_reward/std": 0.5016420483589172,
      "step": 495
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.75,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3185.0,
      "completions/mean_length": 2634.80224609375,
      "completions/mean_terminated_length": 1414.40478515625,
      "completions/min_length": 529.0,
      "completions/min_terminated_length": 529.0,
      "epoch": 1.1348571428571428,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.10370950400829315,
      "kl": 0.0008077621459960938,
      "learning_rate": 1.002741278414069e-08,
      "loss": 0.064,
      "num_tokens": 139390827.0,
      "reward": 0.4583333432674408,
      "reward_std": 0.20280227065086365,
      "rewards/format_reward/mean": 0.4583333432674408,
      "rewards/format_reward/std": 0.5008764266967773,
      "step": 496
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.5833333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3311.0,
      "completions/mean_length": 2563.65625,
      "completions/mean_terminated_length": 1006.2894897460938,
      "completions/min_length": 236.0,
      "completions/min_terminated_length": 236.0,
      "epoch": 1.1371428571428572,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.11594365537166595,
      "kl": 0.0008608500162760416,
      "learning_rate": 1.0017544823184054e-08,
      "loss": 0.0579,
      "num_tokens": 139652034.0,
      "reward": 0.4375,
      "reward_std": 0.16661179065704346,
      "rewards/format_reward/mean": 0.4375,
      "rewards/format_reward/std": 0.4986824691295624,
      "step": 497
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.41666666666666674,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3412.0,
      "completions/mean_length": 2895.83349609375,
      "completions/mean_terminated_length": 1640.941162109375,
      "completions/min_length": 448.0,
      "completions/min_terminated_length": 448.0,
      "epoch": 1.1394285714285715,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.15313191711902618,
      "kl": 0.0007664362589518229,
      "learning_rate": 1.0009869243631952e-08,
      "loss": 0.1271,
      "num_tokens": 139946336.0,
      "reward": 0.40625,
      "reward_std": 0.3622187376022339,
      "rewards/format_reward/mean": 0.40625,
      "rewards/format_reward/std": 0.4937104284763336,
      "step": 498
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -0.8333333333333333,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3562.0,
      "completions/mean_length": 2671.0,
      "completions/mean_terminated_length": 1592.0,
      "completions/min_length": 365.0,
      "completions/min_terminated_length": 365.0,
      "epoch": 1.1417142857142857,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 0.1362977772951126,
      "kl": 0.0007994969685872396,
      "learning_rate": 1.000438641958131e-08,
      "loss": 0.0793,
      "num_tokens": 140218496.0,
      "reward": 0.4895833432674408,
      "reward_std": 0.29766905307769775,
      "rewards/format_reward/mean": 0.4895833432674408,
      "rewards/format_reward/std": 0.5025155544281006,
      "step": 499
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -1.25,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3491.0,
      "completions/mean_length": 2254.0625,
      "completions/mean_terminated_length": 1219.6666259765625,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "epoch": 1.144,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.112995445728302,
      "kl": 0.0009085337320963541,
      "learning_rate": 1.0001096618257237e-08,
      "loss": 0.0594,
      "num_tokens": 140449730.0,
      "reward": 0.5833333730697632,
      "reward_std": 0.17532894015312195,
      "rewards/format_reward/mean": 0.5833333134651184,
      "rewards/format_reward/std": 0.4955945909023285,
      "step": 500
    },
    {
      "epoch": 1.144,
      "step": 500,
      "total_flos": 0.0,
      "train_loss": 0.04486549567896873,
      "train_runtime": 27119.3128,
      "train_samples_per_second": 1.77,
      "train_steps_per_second": 0.018
    }
  ],
  "logging_steps": 1,
  "max_steps": 500,
  "num_input_tokens_seen": 140449730,
  "num_train_epochs": 2,
  "save_steps": 250,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}