{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9986564064981067,
  "eval_steps": 500,
  "global_step": 511,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 7912.0,
      "completions/max_terminated_length": 7912.0,
      "completions/mean_length": 617.13671875,
      "completions/mean_terminated_length": 617.13671875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.0019543178209356295,
      "grad_norm": 0.5497008291481603,
      "learning_rate": 0.0,
      "loss": -0.0172,
      "num_tokens": 381814.0,
      "reward": 0.06640625,
      "reward_std": 0.16296617686748505,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.01953125,
      "rewards/soft_format_reward/std": 0.1385180652141571,
      "step": 1
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5592.0,
      "completions/max_terminated_length": 5592.0,
      "completions/mean_length": 567.18359375,
      "completions/mean_terminated_length": 567.18359375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.003908635641871259,
      "grad_norm": 2.698078274122091,
      "learning_rate": 1.923076923076923e-08,
      "loss": -0.0685,
      "num_tokens": 747460.0,
      "reward": 0.0869140625,
      "reward_std": 0.219474196434021,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.025390625,
      "rewards/soft_format_reward/std": 0.15746226906776428,
      "step": 2
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4418.0,
      "completions/max_terminated_length": 4418.0,
      "completions/mean_length": 513.427734375,
      "completions/mean_terminated_length": 513.427734375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.005862953462806889,
      "grad_norm": 5.308384935213732,
      "learning_rate": 3.846153846153846e-08,
      "loss": -0.0673,
      "num_tokens": 1071487.0,
      "reward": 0.0888671875,
      "reward_std": 0.185697540640831,
      "rewards/accuracy_reward/mean": 0.078125,
      "rewards/accuracy_reward/std": 0.26863065361976624,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.021484375,
      "rewards/soft_format_reward/std": 0.14513419568538666,
      "step": 3
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 7199.0,
      "completions/max_terminated_length": 7199.0,
      "completions/mean_length": 565.57421875,
      "completions/mean_terminated_length": 565.57421875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.007817271283742518,
      "grad_norm": 0.48385115003594786,
      "learning_rate": 5.7692307692307695e-08,
      "loss": -0.0395,
      "num_tokens": 1422405.0,
      "reward": 0.06640625,
      "reward_std": 0.16206470131874084,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.0078125,
      "rewards/soft_format_reward/std": 0.08812850713729858,
      "step": 4
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5413.0,
      "completions/max_terminated_length": 5413.0,
      "completions/mean_length": 561.673828125,
      "completions/mean_terminated_length": 561.673828125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.009771589104678149,
      "grad_norm": 0.381530031845912,
      "learning_rate": 7.692307692307692e-08,
      "loss": -0.0071,
      "num_tokens": 1779006.0,
      "reward": 0.0263671875,
      "reward_std": 0.0822526216506958,
      "rewards/accuracy_reward/mean": 0.015625,
      "rewards/accuracy_reward/std": 0.12414088100194931,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.021484375,
      "rewards/soft_format_reward/std": 0.14513419568538666,
      "step": 5
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 7300.0,
      "completions/max_terminated_length": 7300.0,
      "completions/mean_length": 612.404296875,
      "completions/mean_terminated_length": 612.404296875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.011725906925613778,
      "grad_norm": 0.5102114685216445,
      "learning_rate": 9.615384615384616e-08,
      "loss": -0.0678,
      "num_tokens": 2159213.0,
      "reward": 0.0498046875,
      "reward_std": 0.12858566641807556,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.017578125,
      "rewards/soft_format_reward/std": 0.13154059648513794,
      "step": 6
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3591.0,
      "completions/max_terminated_length": 3591.0,
      "completions/mean_length": 516.505859375,
      "completions/mean_terminated_length": 516.505859375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.013680224746549407,
      "grad_norm": 0.6048195767923973,
      "learning_rate": 1.1538461538461539e-07,
      "loss": -0.0439,
      "num_tokens": 2491552.0,
      "reward": 0.056640625,
      "reward_std": 0.11580279469490051,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.0234375,
      "rewards/soft_format_reward/std": 0.15143637359142303,
      "step": 7
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00390625,
      "completions/max_length": 5877.0,
      "completions/max_terminated_length": 5877.0,
      "completions/mean_length": 526.783203125,
      "completions/mean_terminated_length": 528.8490600585938,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.015634542567485036,
      "grad_norm": 0.6144749886453214,
      "learning_rate": 1.346153846153846e-07,
      "loss": -0.037,
      "num_tokens": 2833313.0,
      "reward": 0.029296875,
      "reward_std": 0.09001073986291885,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.015625,
      "rewards/soft_format_reward/std": 0.12414088100194931,
      "step": 8
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4974.0,
      "completions/max_terminated_length": 4974.0,
      "completions/mean_length": 525.9140625,
      "completions/mean_terminated_length": 525.9140625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.01758886038842067,
      "grad_norm": 0.7578893167945104,
      "learning_rate": 1.5384615384615385e-07,
      "loss": -0.0363,
      "num_tokens": 3176469.0,
      "reward": 0.0400390625,
      "reward_std": 0.11926878988742828,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.025390625,
      "rewards/soft_format_reward/std": 0.15746226906776428,
      "step": 9
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5875.0,
      "completions/max_terminated_length": 5875.0,
      "completions/mean_length": 571.982421875,
      "completions/mean_terminated_length": 571.982421875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.019543178209356298,
      "grad_norm": 0.505459854966566,
      "learning_rate": 1.7307692307692305e-07,
      "loss": -0.0648,
      "num_tokens": 3538620.0,
      "reward": 0.03515625,
      "reward_std": 0.104578398168087,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.01953125,
      "rewards/soft_format_reward/std": 0.1385180652141571,
      "step": 10
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 5900.0,
      "completions/max_terminated_length": 5900.0,
      "completions/mean_length": 553.423828125,
      "completions/mean_terminated_length": 554.5068359375,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.021497496030291927,
      "grad_norm": 0.6386183497474042,
      "learning_rate": 1.9230769230769231e-07,
      "loss": -0.0297,
      "num_tokens": 3893797.0,
      "reward": 0.0400390625,
      "reward_std": 0.107000432908535,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.017578125,
      "rewards/soft_format_reward/std": 0.13154059648513794,
      "step": 11
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6139.0,
      "completions/max_terminated_length": 6139.0,
      "completions/mean_length": 554.06640625,
      "completions/mean_terminated_length": 554.06640625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.023451813851227556,
      "grad_norm": 0.5476698283725961,
      "learning_rate": 2.1153846153846152e-07,
      "loss": -0.0111,
      "num_tokens": 4237223.0,
      "reward": 0.05078125,
      "reward_std": 0.12995371222496033,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.0234375,
      "rewards/soft_format_reward/std": 0.15143637359142303,
      "step": 12
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 8168.0,
      "completions/max_terminated_length": 8168.0,
      "completions/mean_length": 567.6796875,
      "completions/mean_terminated_length": 567.6796875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.025406131672163185,
      "grad_norm": 0.6283143368465841,
      "learning_rate": 2.3076923076923078e-07,
      "loss": -0.0371,
      "num_tokens": 4601331.0,
      "reward": 0.04296875,
      "reward_std": 0.1183362603187561,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.01953125,
      "rewards/soft_format_reward/std": 0.1385180652141571,
      "step": 13
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 3741.0,
      "completions/max_terminated_length": 3741.0,
      "completions/mean_length": 513.072265625,
      "completions/mean_terminated_length": 514.0762939453125,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.027360449493098814,
      "grad_norm": 3.1847469546145972,
      "learning_rate": 2.5e-07,
      "loss": -0.0305,
      "num_tokens": 4934456.0,
      "reward": 0.05859375,
      "reward_std": 0.15280833840370178,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.01953125,
      "rewards/soft_format_reward/std": 0.1385180652141571,
      "step": 14
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 6554.0,
      "completions/max_terminated_length": 6554.0,
      "completions/mean_length": 608.732421875,
      "completions/mean_terminated_length": 609.9236450195312,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.029314767314034446,
      "grad_norm": 0.5723525274669647,
      "learning_rate": 2.692307692307692e-07,
      "loss": -0.0246,
      "num_tokens": 5323775.0,
      "reward": 0.07421875,
      "reward_std": 0.15633760392665863,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.03125,
      "rewards/soft_format_reward/std": 0.17416280508041382,
      "step": 15
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00390625,
      "completions/max_length": 7342.0,
      "completions/max_terminated_length": 7342.0,
      "completions/mean_length": 609.19140625,
      "completions/mean_terminated_length": 611.5804443359375,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.03126908513497007,
      "grad_norm": 0.4570532944391167,
      "learning_rate": 2.884615384615384e-07,
      "loss": 0.018,
      "num_tokens": 5713905.0,
      "reward": 0.0537109375,
      "reward_std": 0.12987464666366577,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.025390625,
      "rewards/soft_format_reward/std": 0.15746226906776428,
      "step": 16
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6857.0,
      "completions/max_terminated_length": 6857.0,
      "completions/mean_length": 606.8046875,
      "completions/mean_terminated_length": 606.8046875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.0332234029559057,
      "grad_norm": 0.7214971252015325,
      "learning_rate": 3.076923076923077e-07,
      "loss": -0.028,
      "num_tokens": 6092541.0,
      "reward": 0.0693359375,
      "reward_std": 0.15594926476478577,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.029296875,
      "rewards/soft_format_reward/std": 0.16880230605602264,
      "step": 17
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6450.0,
      "completions/max_terminated_length": 6450.0,
      "completions/mean_length": 570.732421875,
      "completions/mean_terminated_length": 570.732421875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.03517772077684134,
      "grad_norm": 0.8363578457680537,
      "learning_rate": 3.269230769230769e-07,
      "loss": -0.0084,
      "num_tokens": 6452228.0,
      "reward": 0.0400390625,
      "reward_std": 0.13050393760204315,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.033203125,
      "rewards/soft_format_reward/std": 0.17934183776378632,
      "step": 18
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00390625,
      "completions/max_length": 7415.0,
      "completions/max_terminated_length": 7415.0,
      "completions/mean_length": 660.81640625,
      "completions/mean_terminated_length": 663.4078979492188,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.037132038597776966,
      "grad_norm": 1.8442167181929248,
      "learning_rate": 3.461538461538461e-07,
      "loss": -0.0545,
      "num_tokens": 6862838.0,
      "reward": 0.048828125,
      "reward_std": 0.14615076780319214,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.046875,
      "rewards/soft_format_reward/std": 0.21157780289649963,
      "step": 19
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3138.0,
      "completions/max_terminated_length": 3138.0,
      "completions/mean_length": 499.787109375,
      "completions/mean_terminated_length": 499.787109375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.039086356418712595,
      "grad_norm": 15.273262941735497,
      "learning_rate": 3.6538461538461534e-07,
      "loss": -0.0211,
      "num_tokens": 7198249.0,
      "reward": 0.078125,
      "reward_std": 0.2060895711183548,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.05078125,
      "rewards/soft_format_reward/std": 0.21976542472839355,
      "step": 20
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 5856.0,
      "completions/max_terminated_length": 5856.0,
      "completions/mean_length": 687.306640625,
      "completions/mean_terminated_length": 688.6516723632812,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.041040674239648224,
      "grad_norm": 5.048231501526631,
      "learning_rate": 3.8461538461538463e-07,
      "loss": -0.0082,
      "num_tokens": 7618006.0,
      "reward": 0.10546875,
      "reward_std": 0.22821679711341858,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.0625,
      "rewards/soft_format_reward/std": 0.2422981858253479,
      "step": 21
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6152.0,
      "completions/max_terminated_length": 6152.0,
      "completions/mean_length": 708.736328125,
      "completions/mean_terminated_length": 708.736328125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.04299499206058385,
      "grad_norm": 0.4512712553803841,
      "learning_rate": 4.0384615384615386e-07,
      "loss": 0.0034,
      "num_tokens": 8046383.0,
      "reward": 0.07421875,
      "reward_std": 0.17045770585536957,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.04296875,
      "rewards/soft_format_reward/std": 0.2029850035905838,
      "step": 22
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 5840.0,
      "completions/max_terminated_length": 5840.0,
      "completions/mean_length": 687.8046875,
      "completions/mean_terminated_length": 689.1506958007812,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.04494930988151948,
      "grad_norm": 2.43420844870162,
      "learning_rate": 4.2307692307692304e-07,
      "loss": -0.0204,
      "num_tokens": 8472651.0,
      "reward": 0.087890625,
      "reward_std": 0.19242921471595764,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.05078125,
      "rewards/soft_format_reward/std": 0.21976542472839355,
      "step": 23
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 7117.0,
      "completions/max_terminated_length": 7117.0,
      "completions/mean_length": 681.640625,
      "completions/mean_terminated_length": 681.640625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.04690362770245511,
      "grad_norm": 1.4085772264361187,
      "learning_rate": 4.423076923076923e-07,
      "loss": -0.0008,
      "num_tokens": 8892259.0,
      "reward": 0.0849609375,
      "reward_std": 0.18828873336315155,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.056640625,
      "rewards/soft_format_reward/std": 0.23138070106506348,
      "step": 24
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3989.0,
      "completions/max_terminated_length": 3989.0,
      "completions/mean_length": 655.474609375,
      "completions/mean_terminated_length": 655.474609375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.04885794552339074,
      "grad_norm": 0.7352936977408676,
      "learning_rate": 4.6153846153846156e-07,
      "loss": -0.0287,
      "num_tokens": 9297254.0,
      "reward": 0.0673828125,
      "reward_std": 0.16342398524284363,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.068359375,
      "rewards/soft_format_reward/std": 0.25260838866233826,
      "step": 25
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6753.0,
      "completions/max_terminated_length": 6753.0,
      "completions/mean_length": 654.60546875,
      "completions/mean_terminated_length": 654.60546875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.05081226334432637,
      "grad_norm": 1.556772523178416,
      "learning_rate": 4.807692307692307e-07,
      "loss": 0.0177,
      "num_tokens": 9702892.0,
      "reward": 0.087890625,
      "reward_std": 0.2010476142168045,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.078125,
      "rewards/soft_format_reward/std": 0.26863065361976624,
      "step": 26
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6472.0,
      "completions/max_terminated_length": 6472.0,
      "completions/mean_length": 736.044921875,
      "completions/mean_terminated_length": 736.044921875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.052766581165262,
      "grad_norm": 1.0582490484565645,
      "learning_rate": 5e-07,
      "loss": -0.0134,
      "num_tokens": 10148339.0,
      "reward": 0.1044921875,
      "reward_std": 0.20019401609897614,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.087890625,
      "rewards/soft_format_reward/std": 0.2834126651287079,
      "step": 27
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 5664.0,
      "completions/max_terminated_length": 5664.0,
      "completions/mean_length": 726.126953125,
      "completions/mean_terminated_length": 727.5479125976562,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.05472089898619763,
      "grad_norm": 2.857726402173463,
      "learning_rate": 5.192307692307692e-07,
      "loss": -0.0361,
      "num_tokens": 10592276.0,
      "reward": 0.1015625,
      "reward_std": 0.20877701044082642,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.12109375,
      "rewards/soft_format_reward/std": 0.3265552520751953,
      "step": 28
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00390625,
      "completions/max_length": 7747.0,
      "completions/max_terminated_length": 7747.0,
      "completions/mean_length": 732.8671875,
      "completions/mean_terminated_length": 735.7412109375,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.056675216807133263,
      "grad_norm": 3.8611447120870404,
      "learning_rate": 5.384615384615384e-07,
      "loss": 0.0596,
      "num_tokens": 11039728.0,
      "reward": 0.1279296875,
      "reward_std": 0.2408362179994583,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.158203125,
      "rewards/soft_format_reward/std": 0.36528825759887695,
      "step": 29
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6036.0,
      "completions/max_terminated_length": 6036.0,
      "completions/mean_length": 728.291015625,
      "completions/mean_terminated_length": 728.291015625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.05862953462806889,
      "grad_norm": 6.140510129844352,
      "learning_rate": 5.576923076923077e-07,
      "loss": 0.0455,
      "num_tokens": 11479781.0,
      "reward": 0.1630859375,
      "reward_std": 0.25149115920066833,
      "rewards/accuracy_reward/mean": 0.068359375,
      "rewards/accuracy_reward/std": 0.25260838866233826,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.189453125,
      "rewards/soft_format_reward/std": 0.3922513723373413,
      "step": 30
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5456.0,
      "completions/max_terminated_length": 5456.0,
      "completions/mean_length": 623.748046875,
      "completions/mean_terminated_length": 623.748046875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.06058385244900452,
      "grad_norm": 20.190316320088844,
      "learning_rate": 5.769230769230768e-07,
      "loss": 0.0265,
      "num_tokens": 11866708.0,
      "reward": 0.1708984375,
      "reward_std": 0.2567231357097626,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.208984375,
      "rewards/soft_format_reward/std": 0.40698084235191345,
      "step": 31
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5453.0,
      "completions/max_terminated_length": 5453.0,
      "completions/mean_length": 609.91015625,
      "completions/mean_terminated_length": 609.91015625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.06253817026994014,
      "grad_norm": 9.473332784285086,
      "learning_rate": 5.961538461538461e-07,
      "loss": -0.0105,
      "num_tokens": 12249206.0,
      "reward": 0.2158203125,
      "reward_std": 0.3083522915840149,
      "rewards/accuracy_reward/mean": 0.111328125,
      "rewards/accuracy_reward/std": 0.31484565138816833,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.208984375,
      "rewards/soft_format_reward/std": 0.40698084235191345,
      "step": 32
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5294.0,
      "completions/max_terminated_length": 5294.0,
      "completions/mean_length": 664.775390625,
      "completions/mean_terminated_length": 664.775390625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.06449248809087578,
      "grad_norm": 10.829803763505799,
      "learning_rate": 6.153846153846154e-07,
      "loss": 0.0103,
      "num_tokens": 12661043.0,
      "reward": 0.1640625,
      "reward_std": 0.26412612199783325,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.20703125,
      "rewards/soft_format_reward/std": 0.40557438135147095,
      "step": 33
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 7671.0,
      "completions/max_terminated_length": 7671.0,
      "completions/mean_length": 666.4140625,
      "completions/mean_terminated_length": 666.4140625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.0664468059118114,
      "grad_norm": 1.1974288910365694,
      "learning_rate": 6.346153846153845e-07,
      "loss": 0.0445,
      "num_tokens": 13073399.0,
      "reward": 0.2412109375,
      "reward_std": 0.33153459429740906,
      "rewards/accuracy_reward/mean": 0.099609375,
      "rewards/accuracy_reward/std": 0.29977133870124817,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.283203125,
      "rewards/soft_format_reward/std": 0.4509948492050171,
      "step": 34
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4454.0,
      "completions/max_terminated_length": 4454.0,
      "completions/mean_length": 571.095703125,
      "completions/mean_terminated_length": 571.095703125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.06840112373274704,
      "grad_norm": 13.626751077969622,
      "learning_rate": 6.538461538461538e-07,
      "loss": 0.01,
      "num_tokens": 13440776.0,
      "reward": 0.224609375,
      "reward_std": 0.2904402017593384,
      "rewards/accuracy_reward/mean": 0.068359375,
      "rewards/accuracy_reward/std": 0.25260838866233826,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.3125,
      "rewards/soft_format_reward/std": 0.4639657139778137,
      "step": 35
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 2804.0,
      "completions/max_terminated_length": 2804.0,
      "completions/mean_length": 640.6328125,
      "completions/mean_terminated_length": 641.886474609375,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.07035544155368267,
      "grad_norm": 1.819852103878187,
      "learning_rate": 6.730769230769231e-07,
      "loss": 0.0503,
      "num_tokens": 13846364.0,
      "reward": 0.2392578125,
      "reward_std": 0.311190664768219,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.384765625,
      "rewards/soft_format_reward/std": 0.4870156943798065,
      "step": 36
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6238.0,
      "completions/max_terminated_length": 6238.0,
      "completions/mean_length": 639.701171875,
      "completions/mean_terminated_length": 639.701171875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.0723097593746183,
      "grad_norm": 1.4235130554682933,
      "learning_rate": 6.923076923076922e-07,
      "loss": 0.0002,
      "num_tokens": 14248707.0,
      "reward": 0.2451171875,
      "reward_std": 0.29699093103408813,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.423828125,
      "rewards/soft_format_reward/std": 0.4946470856666565,
      "step": 37
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 7398.0,
      "completions/max_terminated_length": 7398.0,
      "completions/mean_length": 590.271484375,
      "completions/mean_terminated_length": 590.271484375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.07426407719555393,
      "grad_norm": 0.628319132660879,
      "learning_rate": 7.115384615384616e-07,
      "loss": 0.0754,
      "num_tokens": 14623614.0,
      "reward": 0.29296875,
      "reward_std": 0.2848966419696808,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.484375,
      "rewards/soft_format_reward/std": 0.5002445578575134,
      "step": 38
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5965.0,
      "completions/max_terminated_length": 5965.0,
      "completions/mean_length": 613.828125,
      "completions/mean_terminated_length": 613.828125,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.07621839501648955,
      "grad_norm": 0.7613404241291125,
      "learning_rate": 7.307692307692307e-07,
      "loss": 0.0549,
      "num_tokens": 15010438.0,
      "reward": 0.3310546875,
      "reward_std": 0.2927667796611786,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.572265625,
      "rewards/soft_format_reward/std": 0.4952339828014374,
      "step": 39
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4613.0,
      "completions/max_terminated_length": 4613.0,
      "completions/mean_length": 597.8828125,
      "completions/mean_terminated_length": 597.8828125,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.07817271283742519,
      "grad_norm": 0.7429177868961824,
      "learning_rate": 7.5e-07,
      "loss": 0.0252,
      "num_tokens": 15389594.0,
      "reward": 0.40625,
      "reward_std": 0.3103194832801819,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.6953125,
      "rewards/soft_format_reward/std": 0.4607250988483429,
      "step": 40
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5187.0,
      "completions/max_terminated_length": 5187.0,
      "completions/mean_length": 574.287109375,
      "completions/mean_terminated_length": 574.287109375,
      "completions/min_length": 42.0,
      "completions/min_terminated_length": 42.0,
      "epoch": 0.08012703065836081,
      "grad_norm": 29.45387609226835,
      "learning_rate": 7.692307692307693e-07,
      "loss": 0.0312,
      "num_tokens": 15757581.0,
      "reward": 0.435546875,
      "reward_std": 0.2659274637699127,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.76953125,
      "rewards/soft_format_reward/std": 0.42154473066329956,
      "step": 41
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2535.0,
      "completions/max_terminated_length": 2535.0,
      "completions/mean_length": 541.6171875,
      "completions/mean_terminated_length": 541.6171875,
      "completions/min_length": 7.0,
      "completions/min_terminated_length": 7.0,
      "epoch": 0.08208134847929645,
      "grad_norm": 13.134675337038862,
      "learning_rate": 7.884615384615384e-07,
      "loss": 0.0486,
      "num_tokens": 16107305.0,
      "reward": 0.48046875,
      "reward_std": 0.26097944378852844,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.8125,
      "rewards/soft_format_reward/std": 0.39069411158561707,
      "step": 42
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6681.0,
      "completions/max_terminated_length": 6681.0,
      "completions/mean_length": 602.7421875,
      "completions/mean_terminated_length": 602.7421875,
      "completions/min_length": 65.0,
      "completions/min_terminated_length": 65.0,
      "epoch": 0.08403566630023207,
      "grad_norm": 1.3047464546026393,
      "learning_rate": 8.076923076923077e-07,
      "loss": 0.0358,
      "num_tokens": 16486613.0,
      "reward": 0.453125,
      "reward_std": 0.23063214123249054,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.8359375,
      "rewards/soft_format_reward/std": 0.37069445848464966,
      "step": 43
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6393.0,
      "completions/max_terminated_length": 6393.0,
      "completions/mean_length": 585.7734375,
      "completions/mean_terminated_length": 585.7734375,
      "completions/min_length": 53.0,
      "completions/min_terminated_length": 53.0,
      "epoch": 0.0859899841211677,
      "grad_norm": 1.0830243597441251,
      "learning_rate": 8.269230769230768e-07,
      "loss": 0.0811,
      "num_tokens": 16853809.0,
      "reward": 0.470703125,
      "reward_std": 0.1985776722431183,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.86328125,
      "rewards/soft_format_reward/std": 0.3438861668109894,
      "step": 44
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 7895.0,
      "completions/max_terminated_length": 7895.0,
      "completions/mean_length": 609.8515625,
      "completions/mean_terminated_length": 609.8515625,
      "completions/min_length": 46.0,
      "completions/min_terminated_length": 46.0,
      "epoch": 0.08794430194210333,
      "grad_norm": 0.6775285892968461,
      "learning_rate": 8.461538461538461e-07,
      "loss": 0.0693,
      "num_tokens": 17235989.0,
      "reward": 0.4794921875,
      "reward_std": 0.20187821984291077,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.900390625,
      "rewards/soft_format_reward/std": 0.29977133870124817,
      "step": 45
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4606.0,
      "completions/max_terminated_length": 4606.0,
      "completions/mean_length": 584.8515625,
      "completions/mean_terminated_length": 584.8515625,
      "completions/min_length": 75.0,
      "completions/min_terminated_length": 75.0,
      "epoch": 0.08989861976303896,
      "grad_norm": 1.1014489991571397,
      "learning_rate": 8.653846153846154e-07,
      "loss": 0.0795,
      "num_tokens": 17606137.0,
      "reward": 0.498046875,
      "reward_std": 0.2243758738040924,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.8984375,
      "rewards/soft_format_reward/std": 0.30236753821372986,
      "step": 46
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3147.0,
      "completions/max_terminated_length": 3147.0,
      "completions/mean_length": 544.201171875,
      "completions/mean_terminated_length": 544.201171875,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.0918529375839746,
      "grad_norm": 0.574106279906047,
      "learning_rate": 8.846153846153846e-07,
      "loss": 0.0314,
      "num_tokens": 17956080.0,
      "reward": 0.533203125,
      "reward_std": 0.22982193529605865,
      "rewards/accuracy_reward/mean": 0.078125,
      "rewards/accuracy_reward/std": 0.26863065361976624,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.91015625,
      "rewards/soft_format_reward/std": 0.2862374484539032,
      "step": 47
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 5575.0,
      "completions/max_terminated_length": 5575.0,
      "completions/mean_length": 532.357421875,
      "completions/mean_terminated_length": 533.3992309570312,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 76.0,
      "epoch": 0.09380725540491022,
      "grad_norm": 0.6550073360929063,
      "learning_rate": 9.038461538461538e-07,
      "loss": 0.0482,
      "num_tokens": 18319511.0,
      "reward": 0.5029296875,
      "reward_std": 0.19548678398132324,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.927734375,
      "rewards/soft_format_reward/std": 0.2591804563999176,
      "step": 48
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4345.0,
      "completions/max_terminated_length": 4345.0,
      "completions/mean_length": 549.814453125,
      "completions/mean_terminated_length": 549.814453125,
      "completions/min_length": 76.0,
      "completions/min_terminated_length": 76.0,
      "epoch": 0.09576157322584586,
      "grad_norm": 0.5803676816838714,
      "learning_rate": 9.230769230769231e-07,
      "loss": 0.0097,
      "num_tokens": 18672504.0,
      "reward": 0.537109375,
      "reward_std": 0.15639463067054749,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.95703125,
      "rewards/soft_format_reward/std": 0.2029850035905838,
      "step": 49
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5540.0,
      "completions/max_terminated_length": 5540.0,
      "completions/mean_length": 479.1484375,
      "completions/mean_terminated_length": 479.1484375,
      "completions/min_length": 59.0,
      "completions/min_terminated_length": 59.0,
      "epoch": 0.09771589104678148,
      "grad_norm": 0.481600628947521,
      "learning_rate": 9.423076923076923e-07,
      "loss": 0.0072,
      "num_tokens": 18988532.0,
      "reward": 0.5673828125,
      "reward_std": 0.19646382331848145,
      "rewards/accuracy_reward/mean": 0.0859375,
      "rewards/accuracy_reward/std": 0.28054583072662354,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.962890625,
      "rewards/soft_format_reward/std": 0.18921469151973724,
      "step": 50
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3445.0,
      "completions/max_terminated_length": 3445.0,
      "completions/mean_length": 571.2265625,
      "completions/mean_terminated_length": 571.2265625,
      "completions/min_length": 53.0,
      "completions/min_terminated_length": 53.0,
      "epoch": 0.09967020886771712,
      "grad_norm": 0.4012053390386752,
      "learning_rate": 9.615384615384615e-07,
      "loss": 0.0531,
      "num_tokens": 19359016.0,
      "reward": 0.486328125,
      "reward_std": 0.08090324699878693,
      "rewards/accuracy_reward/mean": 0.005859375,
      "rewards/accuracy_reward/std": 0.07639661431312561,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.9609375,
      "rewards/soft_format_reward/std": 0.1939331740140915,
      "step": 51
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2326.0,
      "completions/max_terminated_length": 2326.0,
      "completions/mean_length": 550.65234375,
      "completions/mean_terminated_length": 550.65234375,
      "completions/min_length": 40.0,
      "completions/min_terminated_length": 40.0,
      "epoch": 0.10162452668865274,
      "grad_norm": 0.5384873921400348,
      "learning_rate": 9.807692307692306e-07,
      "loss": 0.0421,
      "num_tokens": 19731990.0,
      "reward": 0.51953125,
      "reward_std": 0.1248648464679718,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.96875,
      "rewards/soft_format_reward/std": 0.17416280508041382,
      "step": 52
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 1958.0,
      "completions/max_terminated_length": 1958.0,
      "completions/mean_length": 499.494140625,
      "completions/mean_terminated_length": 500.47161865234375,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.10357884450958837,
      "grad_norm": 0.498222527692459,
      "learning_rate": 1e-06,
      "loss": 0.018,
      "num_tokens": 20062755.0,
      "reward": 0.5810546875,
      "reward_std": 0.19981679320335388,
      "rewards/accuracy_reward/mean": 0.099609375,
      "rewards/accuracy_reward/std": 0.29977133870124817,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.962890625,
      "rewards/soft_format_reward/std": 0.18921469151973724,
      "step": 53
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2145.0,
      "completions/max_terminated_length": 2145.0,
      "completions/mean_length": 492.4609375,
      "completions/mean_terminated_length": 492.4609375,
      "completions/min_length": 70.0,
      "completions/min_terminated_length": 70.0,
      "epoch": 0.105533162330524,
      "grad_norm": 0.5552275571581469,
      "learning_rate": 1e-06,
      "loss": 0.0138,
      "num_tokens": 20397039.0,
      "reward": 0.583984375,
      "reward_std": 0.1560341715812683,
      "rewards/accuracy_reward/mean": 0.08984375,
      "rewards/accuracy_reward/std": 0.2862374484539032,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.98828125,
      "rewards/soft_format_reward/std": 0.10772226005792618,
      "step": 54
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2139.0,
      "completions/max_terminated_length": 2139.0,
      "completions/mean_length": 564.458984375,
      "completions/mean_terminated_length": 564.458984375,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.10748748015145963,
      "grad_norm": 0.2624076614092068,
      "learning_rate": 1e-06,
      "loss": -0.0092,
      "num_tokens": 20761898.0,
      "reward": 0.52734375,
      "reward_std": 0.06822281330823898,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.98828125,
      "rewards/soft_format_reward/std": 0.10772226005792618,
      "step": 55
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2003.0,
      "completions/max_terminated_length": 2003.0,
      "completions/mean_length": 522.08203125,
      "completions/mean_terminated_length": 522.08203125,
      "completions/min_length": 79.0,
      "completions/min_terminated_length": 79.0,
      "epoch": 0.10944179797239525,
      "grad_norm": 0.19130326711252885,
      "learning_rate": 1e-06,
      "loss": -0.0066,
      "num_tokens": 21108452.0,
      "reward": 0.5283203125,
      "reward_std": 0.06760530173778534,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 56
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1655.0,
      "completions/max_terminated_length": 1655.0,
      "completions/mean_length": 502.4375,
      "completions/mean_terminated_length": 502.4375,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.11139611579333089,
      "grad_norm": 0.2850231203751861,
      "learning_rate": 1e-06,
      "loss": -0.0068,
      "num_tokens": 21448228.0,
      "reward": 0.529296875,
      "reward_std": 0.06024399399757385,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 57
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5151.0,
      "completions/max_terminated_length": 5151.0,
      "completions/mean_length": 564.33203125,
      "completions/mean_terminated_length": 564.33203125,
      "completions/min_length": 84.0,
      "completions/min_terminated_length": 84.0,
      "epoch": 0.11335043361426653,
      "grad_norm": 0.3203274608081484,
      "learning_rate": 1e-06,
      "loss": 0.0214,
      "num_tokens": 21823918.0,
      "reward": 0.509765625,
      "reward_std": 0.043847277760505676,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 58
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3359.0,
      "completions/max_terminated_length": 3359.0,
      "completions/mean_length": 575.369140625,
      "completions/mean_terminated_length": 575.369140625,
      "completions/min_length": 74.0,
      "completions/min_terminated_length": 74.0,
      "epoch": 0.11530475143520215,
      "grad_norm": 0.22055660729223742,
      "learning_rate": 1e-06,
      "loss": 0.0264,
      "num_tokens": 22203387.0,
      "reward": 0.5107421875,
      "reward_std": 0.043188292533159256,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.994140625,
      "rewards/soft_format_reward/std": 0.07639661431312561,
      "step": 59
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2076.0,
      "completions/max_terminated_length": 2076.0,
      "completions/mean_length": 536.74609375,
      "completions/mean_terminated_length": 536.74609375,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "epoch": 0.11725906925613779,
      "grad_norm": 0.872705985489336,
      "learning_rate": 1e-06,
      "loss": 0.0064,
      "num_tokens": 22553513.0,
      "reward": 0.51171875,
      "reward_std": 0.046875,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 60
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1573.0,
      "completions/max_terminated_length": 1573.0,
      "completions/mean_length": 557.34375,
      "completions/mean_terminated_length": 557.34375,
      "completions/min_length": 72.0,
      "completions/min_terminated_length": 72.0,
      "epoch": 0.11921338707707341,
      "grad_norm": 0.39900505592881763,
      "learning_rate": 1e-06,
      "loss": -0.005,
      "num_tokens": 22951465.0,
      "reward": 0.5546875,
      "reward_std": 0.11014671623706818,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 61
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1769.0,
      "completions/max_terminated_length": 1769.0,
      "completions/mean_length": 543.732421875,
      "completions/mean_terminated_length": 543.732421875,
      "completions/min_length": 54.0,
      "completions/min_terminated_length": 54.0,
      "epoch": 0.12116770489800904,
      "grad_norm": 0.5615740045847875,
      "learning_rate": 1e-06,
      "loss": -0.0009,
      "num_tokens": 23304432.0,
      "reward": 0.556640625,
      "reward_std": 0.1440330445766449,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 62
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1900.0,
      "completions/max_terminated_length": 1900.0,
      "completions/mean_length": 509.953125,
      "completions/mean_terminated_length": 509.953125,
      "completions/min_length": 80.0,
      "completions/min_terminated_length": 80.0,
      "epoch": 0.12312202271894467,
      "grad_norm": 0.501055796908243,
      "learning_rate": 1e-06,
      "loss": 0.0046,
      "num_tokens": 23625384.0,
      "reward": 0.544921875,
      "reward_std": 0.07823248207569122,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 63
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2610.0,
      "completions/max_terminated_length": 2610.0,
      "completions/mean_length": 520.38671875,
      "completions/mean_terminated_length": 520.38671875,
      "completions/min_length": 43.0,
      "completions/min_terminated_length": 43.0,
      "epoch": 0.1250763405398803,
      "grad_norm": 0.6171900301640806,
      "learning_rate": 1e-06,
      "loss": 0.0256,
      "num_tokens": 23960798.0,
      "reward": 0.5634765625,
      "reward_std": 0.11427251994609833,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 64
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3039.0,
      "completions/max_terminated_length": 3039.0,
      "completions/mean_length": 545.3046875,
      "completions/mean_terminated_length": 545.3046875,
      "completions/min_length": 64.0,
      "completions/min_terminated_length": 64.0,
      "epoch": 0.12703065836081592,
      "grad_norm": 0.3948540084280699,
      "learning_rate": 1e-06,
      "loss": -0.004,
      "num_tokens": 24339754.0,
      "reward": 0.5498046875,
      "reward_std": 0.11211925745010376,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 65
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1775.0,
      "completions/max_terminated_length": 1775.0,
      "completions/mean_length": 494.08203125,
      "completions/mean_terminated_length": 494.08203125,
      "completions/min_length": 34.0,
      "completions/min_terminated_length": 34.0,
      "epoch": 0.12898497618175156,
      "grad_norm": 0.3304401976843138,
      "learning_rate": 1e-06,
      "loss": -0.0111,
      "num_tokens": 24677252.0,
      "reward": 0.572265625,
      "reward_std": 0.13781127333641052,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 66
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2882.0,
      "completions/max_terminated_length": 2882.0,
      "completions/mean_length": 541.107421875,
      "completions/mean_terminated_length": 541.107421875,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "epoch": 0.1309392940026872,
      "grad_norm": 0.24461956142234229,
      "learning_rate": 1e-06,
      "loss": -0.0105,
      "num_tokens": 25029467.0,
      "reward": 0.5205078125,
      "reward_std": 0.060458000749349594,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 67
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2125.0,
      "completions/max_terminated_length": 2125.0,
      "completions/mean_length": 536.458984375,
      "completions/mean_terminated_length": 536.458984375,
      "completions/min_length": 78.0,
      "completions/min_terminated_length": 78.0,
      "epoch": 0.1328936118236228,
      "grad_norm": 0.18539220344454985,
      "learning_rate": 1e-06,
      "loss": -0.0002,
      "num_tokens": 25386966.0,
      "reward": 0.5146484375,
      "reward_std": 0.04230354726314545,
      "rewards/accuracy_reward/mean": 0.015625,
      "rewards/accuracy_reward/std": 0.12414088100194931,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 68
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1245.0,
      "completions/max_terminated_length": 1245.0,
      "completions/mean_length": 530.16015625,
      "completions/mean_terminated_length": 530.16015625,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "epoch": 0.13484792964455844,
      "grad_norm": 0.25468225730494143,
      "learning_rate": 1e-06,
      "loss": 0.0031,
      "num_tokens": 25738152.0,
      "reward": 0.517578125,
      "reward_std": 0.043135739862918854,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 69
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1268.0,
      "completions/max_terminated_length": 1268.0,
      "completions/mean_length": 508.466796875,
      "completions/mean_terminated_length": 508.466796875,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "epoch": 0.13680224746549408,
      "grad_norm": 0.20022401436907783,
      "learning_rate": 1e-06,
      "loss": -0.0044,
      "num_tokens": 26060535.0,
      "reward": 0.513671875,
      "reward_std": 0.043847277760505676,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 70
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1671.0,
      "completions/max_terminated_length": 1671.0,
      "completions/mean_length": 537.802734375,
      "completions/mean_terminated_length": 537.802734375,
      "completions/min_length": 58.0,
      "completions/min_terminated_length": 58.0,
      "epoch": 0.1387565652864297,
      "grad_norm": 0.2668852184808627,
      "learning_rate": 1e-06,
      "loss": -0.0031,
      "num_tokens": 26416818.0,
      "reward": 0.5390625,
      "reward_std": 0.07992979884147644,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 71
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4278.0,
      "completions/max_terminated_length": 4278.0,
      "completions/mean_length": 562.83984375,
      "completions/mean_terminated_length": 562.83984375,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.14071088310736535,
      "grad_norm": 0.2304930419413057,
      "learning_rate": 1e-06,
      "loss": -0.0037,
      "num_tokens": 26772208.0,
      "reward": 0.5322265625,
      "reward_std": 0.0869225412607193,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 72
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2317.0,
      "completions/max_terminated_length": 2317.0,
      "completions/mean_length": 594.578125,
      "completions/mean_terminated_length": 594.578125,
      "completions/min_length": 38.0,
      "completions/min_terminated_length": 38.0,
      "epoch": 0.14266520092830096,
      "grad_norm": 0.22724559534599176,
      "learning_rate": 1e-06,
      "loss": -0.0032,
      "num_tokens": 27160920.0,
      "reward": 0.53125,
      "reward_std": 0.07702205330133438,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 73
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2284.0,
      "completions/max_terminated_length": 2284.0,
      "completions/mean_length": 578.845703125,
      "completions/mean_terminated_length": 578.845703125,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "epoch": 0.1446195187492366,
      "grad_norm": 0.22287386551482907,
      "learning_rate": 1e-06,
      "loss": -0.006,
      "num_tokens": 27542313.0,
      "reward": 0.5166015625,
      "reward_std": 0.051993079483509064,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 74
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1372.0,
      "completions/max_terminated_length": 1372.0,
      "completions/mean_length": 613.80859375,
      "completions/mean_terminated_length": 613.80859375,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "epoch": 0.14657383657017223,
      "grad_norm": 0.18029464401038803,
      "learning_rate": 1e-06,
      "loss": -0.0038,
      "num_tokens": 27934759.0,
      "reward": 0.5244140625,
      "reward_std": 0.048324182629585266,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 75
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3789.0,
      "completions/max_terminated_length": 3789.0,
      "completions/mean_length": 580.494140625,
      "completions/mean_terminated_length": 580.494140625,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "epoch": 0.14852815439110786,
      "grad_norm": 0.23367405275494105,
      "learning_rate": 1e-06,
      "loss": 0.0071,
      "num_tokens": 28304788.0,
      "reward": 0.5107421875,
      "reward_std": 0.04472580552101135,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.994140625,
      "rewards/soft_format_reward/std": 0.07639661431312561,
      "step": 76
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2703.0,
      "completions/max_terminated_length": 2703.0,
      "completions/mean_length": 662.54296875,
      "completions/mean_terminated_length": 662.54296875,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "epoch": 0.15048247221204347,
      "grad_norm": 0.15822988644136107,
      "learning_rate": 1e-06,
      "loss": 0.0007,
      "num_tokens": 28722346.0,
      "reward": 0.5166015625,
      "reward_std": 0.047041989862918854,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 77
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1879.0,
      "completions/max_terminated_length": 1879.0,
      "completions/mean_length": 651.3359375,
      "completions/mean_terminated_length": 651.3359375,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "epoch": 0.1524367900329791,
      "grad_norm": 0.1356967673128265,
      "learning_rate": 1e-06,
      "loss": -0.0013,
      "num_tokens": 29145574.0,
      "reward": 0.5078125,
      "reward_std": 0.02960042469203472,
      "rewards/accuracy_reward/mean": 0.009765625,
      "rewards/accuracy_reward/std": 0.09843364357948303,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 78
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1352.0,
      "completions/max_terminated_length": 1352.0,
      "completions/mean_length": 649.826171875,
      "completions/mean_terminated_length": 649.826171875,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "epoch": 0.15439110785391474,
      "grad_norm": 0.23821706835435685,
      "learning_rate": 1e-06,
      "loss": -0.0035,
      "num_tokens": 29575085.0,
      "reward": 0.5078125,
      "reward_std": 0.03125,
      "rewards/accuracy_reward/mean": 0.0078125,
      "rewards/accuracy_reward/std": 0.08812850713729858,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 79
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4816.0,
      "completions/max_terminated_length": 4816.0,
      "completions/mean_length": 647.208984375,
      "completions/mean_terminated_length": 647.208984375,
      "completions/min_length": 85.0,
      "completions/min_terminated_length": 85.0,
      "epoch": 0.15634542567485038,
      "grad_norm": 0.20539071002520867,
      "learning_rate": 1e-06,
      "loss": 0.001,
      "num_tokens": 29972664.0,
      "reward": 0.5126953125,
      "reward_std": 0.05446862801909447,
      "rewards/accuracy_reward/mean": 0.015625,
      "rewards/accuracy_reward/std": 0.12414088100194931,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.994140625,
      "rewards/soft_format_reward/std": 0.07639661431312561,
      "step": 80
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2282.0,
      "completions/max_terminated_length": 2282.0,
      "completions/mean_length": 580.908203125,
      "completions/mean_terminated_length": 580.908203125,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "epoch": 0.158299743495786,
      "grad_norm": 0.38231227027935133,
      "learning_rate": 1e-06,
      "loss": 0.0016,
      "num_tokens": 30353817.0,
      "reward": 0.5732421875,
      "reward_std": 0.14383290708065033,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 81
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3727.0,
      "completions/max_terminated_length": 3727.0,
      "completions/mean_length": 590.91796875,
      "completions/mean_terminated_length": 590.91796875,
      "completions/min_length": 90.0,
      "completions/min_terminated_length": 90.0,
      "epoch": 0.16025406131672162,
      "grad_norm": 0.2840205979252729,
      "learning_rate": 1e-06,
      "loss": -0.0014,
      "num_tokens": 30747375.0,
      "reward": 0.6103515625,
      "reward_std": 0.1419982761144638,
      "rewards/accuracy_reward/mean": 0.111328125,
      "rewards/accuracy_reward/std": 0.31484565138816833,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 82
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2953.0,
      "completions/max_terminated_length": 2953.0,
      "completions/mean_length": 650.36328125,
      "completions/mean_terminated_length": 650.36328125,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "epoch": 0.16220837913765726,
      "grad_norm": 0.26607531904033477,
      "learning_rate": 1e-06,
      "loss": 0.019,
      "num_tokens": 31164505.0,
      "reward": 0.685546875,
      "reward_std": 0.18039385974407196,
      "rewards/accuracy_reward/mean": 0.1875,
      "rewards/accuracy_reward/std": 0.39069411158561707,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 83
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1547.0,
      "completions/max_terminated_length": 1547.0,
      "completions/mean_length": 627.5546875,
      "completions/mean_terminated_length": 627.5546875,
      "completions/min_length": 190.0,
      "completions/min_terminated_length": 190.0,
      "epoch": 0.1641626969585929,
      "grad_norm": 0.4799261235792459,
      "learning_rate": 1e-06,
      "loss": -0.0004,
      "num_tokens": 31566789.0,
      "reward": 0.58984375,
      "reward_std": 0.15521381795406342,
      "rewards/accuracy_reward/mean": 0.08984375,
      "rewards/accuracy_reward/std": 0.2862374484539032,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 84
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2090.0,
      "completions/max_terminated_length": 2090.0,
      "completions/mean_length": 589.341796875,
      "completions/mean_terminated_length": 589.341796875,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "epoch": 0.16611701477952853,
      "grad_norm": 0.35292376862513175,
      "learning_rate": 1e-06,
      "loss": -0.0007,
      "num_tokens": 31952884.0,
      "reward": 0.630859375,
      "reward_std": 0.15488673746585846,
      "rewards/accuracy_reward/mean": 0.130859375,
      "rewards/accuracy_reward/std": 0.33757632970809937,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 85
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5012.0,
      "completions/max_terminated_length": 5012.0,
      "completions/mean_length": 627.216796875,
      "completions/mean_terminated_length": 627.216796875,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "epoch": 0.16807133260046414,
      "grad_norm": 0.2808525258157565,
      "learning_rate": 1e-06,
      "loss": 0.0234,
      "num_tokens": 32358211.0,
      "reward": 0.6005859375,
      "reward_std": 0.11676256358623505,
      "rewards/accuracy_reward/mean": 0.1015625,
      "rewards/accuracy_reward/std": 0.30236753821372986,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 86
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5346.0,
      "completions/max_terminated_length": 5346.0,
      "completions/mean_length": 600.85546875,
      "completions/mean_terminated_length": 600.85546875,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "epoch": 0.17002565042139978,
      "grad_norm": 1.2788367609840963,
      "learning_rate": 1e-06,
      "loss": 0.0332,
      "num_tokens": 32762441.0,
      "reward": 0.5771484375,
      "reward_std": 0.15553121268749237,
      "rewards/accuracy_reward/mean": 0.080078125,
      "rewards/accuracy_reward/std": 0.271679550409317,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.994140625,
      "rewards/soft_format_reward/std": 0.07639661431312561,
      "step": 87
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4114.0,
      "completions/max_terminated_length": 4114.0,
      "completions/mean_length": 583.478515625,
      "completions/mean_terminated_length": 583.478515625,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "epoch": 0.1719799682423354,
      "grad_norm": 0.8381002401118538,
      "learning_rate": 1e-06,
      "loss": 0.0158,
      "num_tokens": 33153966.0,
      "reward": 0.5673828125,
      "reward_std": 0.1311238408088684,
      "rewards/accuracy_reward/mean": 0.068359375,
      "rewards/accuracy_reward/std": 0.25260838866233826,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 88
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2656.0,
      "completions/max_terminated_length": 2656.0,
      "completions/mean_length": 561.716796875,
      "completions/mean_terminated_length": 561.716796875,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "epoch": 0.17393428606327105,
      "grad_norm": 1.2959332169759497,
      "learning_rate": 1e-06,
      "loss": 0.0016,
      "num_tokens": 33502845.0,
      "reward": 0.5556640625,
      "reward_std": 0.11637574434280396,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.994140625,
      "rewards/soft_format_reward/std": 0.07639661431312561,
      "step": 89
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4902.0,
      "completions/max_terminated_length": 4902.0,
      "completions/mean_length": 671.85546875,
      "completions/mean_terminated_length": 671.85546875,
      "completions/min_length": 225.0,
      "completions/min_terminated_length": 225.0,
      "epoch": 0.17588860388420666,
      "grad_norm": 0.3142154309040994,
      "learning_rate": 1e-06,
      "loss": 0.0212,
      "num_tokens": 33926915.0,
      "reward": 0.5625,
      "reward_std": 0.12823474407196045,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.9921875,
      "rewards/soft_format_reward/std": 0.08812850713729858,
      "step": 90
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3444.0,
      "completions/max_terminated_length": 3444.0,
      "completions/mean_length": 608.5546875,
      "completions/mean_terminated_length": 608.5546875,
      "completions/min_length": 188.0,
      "completions/min_terminated_length": 188.0,
      "epoch": 0.1778429217051423,
      "grad_norm": 0.3600101015470794,
      "learning_rate": 1e-06,
      "loss": 0.0039,
      "num_tokens": 34302927.0,
      "reward": 0.5361328125,
      "reward_std": 0.09729446470737457,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.990234375,
      "rewards/soft_format_reward/std": 0.09843364357948303,
      "step": 91
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3501.0,
      "completions/max_terminated_length": 3501.0,
      "completions/mean_length": 619.130859375,
      "completions/mean_terminated_length": 619.130859375,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "epoch": 0.17979723952607793,
      "grad_norm": 0.37543906173547886,
      "learning_rate": 1e-06,
      "loss": 0.0062,
      "num_tokens": 34686082.0,
      "reward": 0.5576171875,
      "reward_std": 0.10678014904260635,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 92
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3983.0,
      "completions/max_terminated_length": 3983.0,
      "completions/mean_length": 631.79296875,
      "completions/mean_terminated_length": 631.79296875,
      "completions/min_length": 204.0,
      "completions/min_terminated_length": 204.0,
      "epoch": 0.18175155734701356,
      "grad_norm": 0.34367370480737347,
      "learning_rate": 1e-06,
      "loss": -0.0066,
      "num_tokens": 35081752.0,
      "reward": 0.5400390625,
      "reward_std": 0.08358919620513916,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 93
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3042.0,
      "completions/max_terminated_length": 3042.0,
      "completions/mean_length": 641.7734375,
      "completions/mean_terminated_length": 641.7734375,
      "completions/min_length": 176.0,
      "completions/min_terminated_length": 176.0,
      "epoch": 0.1837058751679492,
      "grad_norm": 0.16839045296922697,
      "learning_rate": 1e-06,
      "loss": 0.0002,
      "num_tokens": 35492212.0,
      "reward": 0.521484375,
      "reward_std": 0.05099457502365112,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 94
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4073.0,
      "completions/max_terminated_length": 4073.0,
      "completions/mean_length": 653.001953125,
      "completions/mean_terminated_length": 653.001953125,
      "completions/min_length": 184.0,
      "completions/min_terminated_length": 184.0,
      "epoch": 0.1856601929888848,
      "grad_norm": 0.26168912129631455,
      "learning_rate": 1e-06,
      "loss": 0.0005,
      "num_tokens": 35894165.0,
      "reward": 0.5625,
      "reward_std": 0.12186098843812943,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 95
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2100.0,
      "completions/max_terminated_length": 2100.0,
      "completions/mean_length": 558.08203125,
      "completions/mean_terminated_length": 558.08203125,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "epoch": 0.18761451080982045,
      "grad_norm": 0.2948149032225118,
      "learning_rate": 1e-06,
      "loss": 0.001,
      "num_tokens": 36251375.0,
      "reward": 0.5322265625,
      "reward_std": 0.06337852776050568,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 96
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1425.0,
      "completions/max_terminated_length": 1425.0,
      "completions/mean_length": 576.83984375,
      "completions/mean_terminated_length": 576.83984375,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 205.0,
      "epoch": 0.18956882863075608,
      "grad_norm": 0.24383757507524517,
      "learning_rate": 1e-06,
      "loss": -0.0086,
      "num_tokens": 36616573.0,
      "reward": 0.5625,
      "reward_std": 0.10904473811388016,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 97
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5066.0,
      "completions/max_terminated_length": 5066.0,
      "completions/mean_length": 659.486328125,
      "completions/mean_terminated_length": 659.486328125,
      "completions/min_length": 193.0,
      "completions/min_terminated_length": 193.0,
      "epoch": 0.19152314645169172,
      "grad_norm": 0.1457015429228493,
      "learning_rate": 1e-06,
      "loss": -0.0001,
      "num_tokens": 37019510.0,
      "reward": 0.53515625,
      "reward_std": 0.05931950733065605,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 98
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3219.0,
      "completions/max_terminated_length": 3219.0,
      "completions/mean_length": 654.630859375,
      "completions/mean_terminated_length": 654.630859375,
      "completions/min_length": 223.0,
      "completions/min_terminated_length": 223.0,
      "epoch": 0.19347746427262733,
      "grad_norm": 0.32090870702118257,
      "learning_rate": 1e-06,
      "loss": 0.0044,
      "num_tokens": 37417945.0,
      "reward": 0.58984375,
      "reward_std": 0.15871897339820862,
      "rewards/accuracy_reward/mean": 0.091796875,
      "rewards/accuracy_reward/std": 0.289021372795105,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 99
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4465.0,
      "completions/max_terminated_length": 4465.0,
      "completions/mean_length": 657.755859375,
      "completions/mean_terminated_length": 657.755859375,
      "completions/min_length": 226.0,
      "completions/min_terminated_length": 226.0,
      "epoch": 0.19543178209356296,
      "grad_norm": 0.20341807021185154,
      "learning_rate": 1e-06,
      "loss": 0.0169,
      "num_tokens": 37839564.0,
      "reward": 0.544921875,
      "reward_std": 0.08864613622426987,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 100
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2882.0,
      "completions/max_terminated_length": 2882.0,
      "completions/mean_length": 643.47265625,
      "completions/mean_terminated_length": 643.47265625,
      "completions/min_length": 52.0,
      "completions/min_terminated_length": 52.0,
      "epoch": 0.1973860999144986,
      "grad_norm": 0.2535460898682244,
      "learning_rate": 1e-06,
      "loss": -0.002,
      "num_tokens": 38255278.0,
      "reward": 0.52734375,
      "reward_std": 0.06777782738208771,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 101
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2238.0,
      "completions/max_terminated_length": 2238.0,
      "completions/mean_length": 660.02734375,
      "completions/mean_terminated_length": 660.02734375,
      "completions/min_length": 202.0,
      "completions/min_terminated_length": 202.0,
      "epoch": 0.19934041773543423,
      "grad_norm": 0.2392828254528082,
      "learning_rate": 1e-06,
      "loss": 0.0013,
      "num_tokens": 38666508.0,
      "reward": 0.5888671875,
      "reward_std": 0.12386061251163483,
      "rewards/accuracy_reward/mean": 0.08984375,
      "rewards/accuracy_reward/std": 0.2862374484539032,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 102
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1597.0,
      "completions/max_terminated_length": 1597.0,
      "completions/mean_length": 695.3359375,
      "completions/mean_terminated_length": 695.3359375,
      "completions/min_length": 251.0,
      "completions/min_terminated_length": 251.0,
      "epoch": 0.20129473555636984,
      "grad_norm": 0.2827856598070154,
      "learning_rate": 1e-06,
      "loss": 0.0057,
      "num_tokens": 39105480.0,
      "reward": 0.59765625,
      "reward_std": 0.16757801175117493,
      "rewards/accuracy_reward/mean": 0.099609375,
      "rewards/accuracy_reward/std": 0.29977133870124817,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 103
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2671.0,
      "completions/max_terminated_length": 2671.0,
      "completions/mean_length": 693.658203125,
      "completions/mean_terminated_length": 693.658203125,
      "completions/min_length": 296.0,
      "completions/min_terminated_length": 296.0,
      "epoch": 0.20324905337730548,
      "grad_norm": 0.27075802048695746,
      "learning_rate": 1e-06,
      "loss": -0.0067,
      "num_tokens": 39541945.0,
      "reward": 0.640625,
      "reward_std": 0.20036140084266663,
      "rewards/accuracy_reward/mean": 0.140625,
      "rewards/accuracy_reward/std": 0.3479743003845215,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 104
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1863.0,
      "completions/max_terminated_length": 1863.0,
      "completions/mean_length": 646.435546875,
      "completions/mean_terminated_length": 646.435546875,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "epoch": 0.2052033711982411,
      "grad_norm": 0.27514293915783544,
      "learning_rate": 1e-06,
      "loss": -0.0029,
      "num_tokens": 39951528.0,
      "reward": 0.568359375,
      "reward_std": 0.1401737928390503,
      "rewards/accuracy_reward/mean": 0.068359375,
      "rewards/accuracy_reward/std": 0.25260838866233826,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 105
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2086.0,
      "completions/max_terminated_length": 2086.0,
      "completions/mean_length": 719.32421875,
      "completions/mean_terminated_length": 719.32421875,
      "completions/min_length": 254.0,
      "completions/min_terminated_length": 254.0,
      "epoch": 0.20715768901917675,
      "grad_norm": 0.227750684574606,
      "learning_rate": 1e-06,
      "loss": 0.0015,
      "num_tokens": 40405582.0,
      "reward": 0.580078125,
      "reward_std": 0.12753018736839294,
      "rewards/accuracy_reward/mean": 0.080078125,
      "rewards/accuracy_reward/std": 0.271679550409317,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 106
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2637.0,
      "completions/max_terminated_length": 2637.0,
      "completions/mean_length": 691.31640625,
      "completions/mean_terminated_length": 691.31640625,
      "completions/min_length": 250.0,
      "completions/min_terminated_length": 250.0,
      "epoch": 0.20911200684011239,
      "grad_norm": 0.2632670115755812,
      "learning_rate": 1e-06,
      "loss": 0.0095,
      "num_tokens": 40849920.0,
      "reward": 0.591796875,
      "reward_std": 0.18596382439136505,
      "rewards/accuracy_reward/mean": 0.091796875,
      "rewards/accuracy_reward/std": 0.289021372795105,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 107
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1475.0,
      "completions/max_terminated_length": 1475.0,
      "completions/mean_length": 687.16796875,
      "completions/mean_terminated_length": 687.16796875,
      "completions/min_length": 183.0,
      "completions/min_terminated_length": 183.0,
      "epoch": 0.211066324661048,
      "grad_norm": 0.2255648801347303,
      "learning_rate": 1e-06,
      "loss": 0.0052,
      "num_tokens": 41288214.0,
      "reward": 0.591796875,
      "reward_std": 0.1527780294418335,
      "rewards/accuracy_reward/mean": 0.091796875,
      "rewards/accuracy_reward/std": 0.289021372795105,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 108
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1898.0,
      "completions/max_terminated_length": 1898.0,
      "completions/mean_length": 684.91796875,
      "completions/mean_terminated_length": 684.91796875,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "epoch": 0.21302064248198363,
      "grad_norm": 0.23856808511899583,
      "learning_rate": 1e-06,
      "loss": -0.0006,
      "num_tokens": 41723484.0,
      "reward": 0.59375,
      "reward_std": 0.14524322748184204,
      "rewards/accuracy_reward/mean": 0.09375,
      "rewards/accuracy_reward/std": 0.29176566004753113,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 109
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3242.0,
      "completions/max_terminated_length": 3242.0,
      "completions/mean_length": 741.443359375,
      "completions/mean_terminated_length": 741.443359375,
      "completions/min_length": 207.0,
      "completions/min_terminated_length": 207.0,
      "epoch": 0.21497496030291927,
      "grad_norm": 0.26469393080648274,
      "learning_rate": 1e-06,
      "loss": 0.011,
      "num_tokens": 42177983.0,
      "reward": 0.701171875,
      "reward_std": 0.22451630234718323,
      "rewards/accuracy_reward/mean": 0.203125,
      "rewards/accuracy_reward/std": 0.4027182459831238,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 110
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1848.0,
      "completions/max_terminated_length": 1848.0,
      "completions/mean_length": 706.369140625,
      "completions/mean_terminated_length": 706.369140625,
      "completions/min_length": 246.0,
      "completions/min_terminated_length": 246.0,
      "epoch": 0.2169292781238549,
      "grad_norm": 0.26485973865282497,
      "learning_rate": 1e-06,
      "loss": 0.005,
      "num_tokens": 42619340.0,
      "reward": 0.599609375,
      "reward_std": 0.1698872447013855,
      "rewards/accuracy_reward/mean": 0.1015625,
      "rewards/accuracy_reward/std": 0.30236753821372986,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 111
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1545.0,
      "completions/max_terminated_length": 1545.0,
      "completions/mean_length": 706.29296875,
      "completions/mean_terminated_length": 706.29296875,
      "completions/min_length": 249.0,
      "completions/min_terminated_length": 249.0,
      "epoch": 0.2188835959447905,
      "grad_norm": 0.2913404975770583,
      "learning_rate": 1e-06,
      "loss": -0.001,
      "num_tokens": 43058418.0,
      "reward": 0.640625,
      "reward_std": 0.19215244054794312,
      "rewards/accuracy_reward/mean": 0.140625,
      "rewards/accuracy_reward/std": 0.3479743003845215,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 112
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1465.0,
      "completions/max_terminated_length": 1465.0,
      "completions/mean_length": 664.96875,
      "completions/mean_terminated_length": 664.96875,
      "completions/min_length": 215.0,
      "completions/min_terminated_length": 215.0,
      "epoch": 0.22083791376572615,
      "grad_norm": 0.15839005706172635,
      "learning_rate": 1e-06,
      "loss": -0.0026,
      "num_tokens": 43480722.0,
      "reward": 0.525390625,
      "reward_std": 0.06354551017284393,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 113
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2067.0,
      "completions/max_terminated_length": 2067.0,
      "completions/mean_length": 640.341796875,
      "completions/mean_terminated_length": 640.341796875,
      "completions/min_length": 239.0,
      "completions/min_terminated_length": 239.0,
      "epoch": 0.22279223158666178,
      "grad_norm": 0.2661420754364224,
      "learning_rate": 1e-06,
      "loss": -0.0006,
      "num_tokens": 43889441.0,
      "reward": 0.580078125,
      "reward_std": 0.1470821499824524,
      "rewards/accuracy_reward/mean": 0.080078125,
      "rewards/accuracy_reward/std": 0.271679550409317,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 114
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2283.0,
      "completions/max_terminated_length": 2283.0,
      "completions/mean_length": 682.2578125,
      "completions/mean_terminated_length": 682.2578125,
      "completions/min_length": 217.0,
      "completions/min_terminated_length": 217.0,
      "epoch": 0.22474654940759742,
      "grad_norm": 0.2609150499556646,
      "learning_rate": 1e-06,
      "loss": 0.0005,
      "num_tokens": 44318037.0,
      "reward": 0.681640625,
      "reward_std": 0.2350630760192871,
      "rewards/accuracy_reward/mean": 0.181640625,
      "rewards/accuracy_reward/std": 0.38592514395713806,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 115
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4330.0,
      "completions/max_terminated_length": 4330.0,
      "completions/mean_length": 731.0078125,
      "completions/mean_terminated_length": 731.0078125,
      "completions/min_length": 63.0,
      "completions/min_terminated_length": 63.0,
      "epoch": 0.22670086722853305,
      "grad_norm": 0.24727089653306142,
      "learning_rate": 1e-06,
      "loss": 0.0035,
      "num_tokens": 44781513.0,
      "reward": 0.6123046875,
      "reward_std": 0.14934487640857697,
      "rewards/accuracy_reward/mean": 0.11328125,
      "rewards/accuracy_reward/std": 0.3172462284564972,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 116
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1518.0,
      "completions/max_terminated_length": 1518.0,
      "completions/mean_length": 725.41015625,
      "completions/mean_terminated_length": 725.41015625,
      "completions/min_length": 248.0,
      "completions/min_terminated_length": 248.0,
      "epoch": 0.22865518504946866,
      "grad_norm": 0.2556791325009635,
      "learning_rate": 1e-06,
      "loss": 0.0071,
      "num_tokens": 45231483.0,
      "reward": 0.6044921875,
      "reward_std": 0.15568572282791138,
      "rewards/accuracy_reward/mean": 0.10546875,
      "rewards/accuracy_reward/std": 0.3074568510055542,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 117
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2137.0,
      "completions/max_terminated_length": 2137.0,
      "completions/mean_length": 767.5703125,
      "completions/mean_terminated_length": 767.5703125,
      "completions/min_length": 268.0,
      "completions/min_terminated_length": 268.0,
      "epoch": 0.2306095028704043,
      "grad_norm": 0.2286798537485802,
      "learning_rate": 1e-06,
      "loss": 0.0047,
      "num_tokens": 45717935.0,
      "reward": 0.5673828125,
      "reward_std": 0.14105787873268127,
      "rewards/accuracy_reward/mean": 0.068359375,
      "rewards/accuracy_reward/std": 0.25260838866233826,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 118
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1265.0,
      "completions/max_terminated_length": 1265.0,
      "completions/mean_length": 707.685546875,
      "completions/mean_terminated_length": 707.685546875,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "epoch": 0.23256382069133993,
      "grad_norm": 0.23624585237603082,
      "learning_rate": 1e-06,
      "loss": -0.0006,
      "num_tokens": 46166526.0,
      "reward": 0.607421875,
      "reward_std": 0.16086432337760925,
      "rewards/accuracy_reward/mean": 0.107421875,
      "rewards/accuracy_reward/std": 0.30995169281959534,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 119
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1570.0,
      "completions/max_terminated_length": 1570.0,
      "completions/mean_length": 638.669921875,
      "completions/mean_terminated_length": 638.669921875,
      "completions/min_length": 207.0,
      "completions/min_terminated_length": 207.0,
      "epoch": 0.23451813851227557,
      "grad_norm": 0.2660519743635799,
      "learning_rate": 1e-06,
      "loss": -0.0077,
      "num_tokens": 46572501.0,
      "reward": 0.5927734375,
      "reward_std": 0.13170786201953888,
      "rewards/accuracy_reward/mean": 0.09375,
      "rewards/accuracy_reward/std": 0.29176566004753113,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 120
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3928.0,
      "completions/max_terminated_length": 3928.0,
      "completions/mean_length": 668.509765625,
      "completions/mean_terminated_length": 668.509765625,
      "completions/min_length": 263.0,
      "completions/min_terminated_length": 263.0,
      "epoch": 0.23647245633321118,
      "grad_norm": 0.27160253111795785,
      "learning_rate": 1e-06,
      "loss": 0.0036,
      "num_tokens": 46990122.0,
      "reward": 0.5810546875,
      "reward_std": 0.10540895164012909,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 121
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2616.0,
      "completions/max_terminated_length": 2616.0,
      "completions/mean_length": 643.935546875,
      "completions/mean_terminated_length": 643.935546875,
      "completions/min_length": 262.0,
      "completions/min_terminated_length": 262.0,
      "epoch": 0.23842677415414681,
      "grad_norm": 0.2782906958717248,
      "learning_rate": 1e-06,
      "loss": 0.003,
      "num_tokens": 47412393.0,
      "reward": 0.599609375,
      "reward_std": 0.15294982492923737,
      "rewards/accuracy_reward/mean": 0.099609375,
      "rewards/accuracy_reward/std": 0.29977133870124817,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 122
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1719.0,
      "completions/max_terminated_length": 1719.0,
      "completions/mean_length": 656.884765625,
      "completions/mean_terminated_length": 656.884765625,
      "completions/min_length": 211.0,
      "completions/min_terminated_length": 211.0,
      "epoch": 0.24038109197508245,
      "grad_norm": 0.2860229830755967,
      "learning_rate": 1e-06,
      "loss": 0.0002,
      "num_tokens": 47830686.0,
      "reward": 0.634765625,
      "reward_std": 0.18557101488113403,
      "rewards/accuracy_reward/mean": 0.134765625,
      "rewards/accuracy_reward/std": 0.3418070077896118,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 123
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1621.0,
      "completions/max_terminated_length": 1621.0,
      "completions/mean_length": 699.51953125,
      "completions/mean_terminated_length": 699.51953125,
      "completions/min_length": 81.0,
      "completions/min_terminated_length": 81.0,
      "epoch": 0.2423354097960181,
      "grad_norm": 0.26407286026259613,
      "learning_rate": 1e-06,
      "loss": 0.0027,
      "num_tokens": 48263288.0,
      "reward": 0.626953125,
      "reward_std": 0.20566867291927338,
      "rewards/accuracy_reward/mean": 0.126953125,
      "rewards/accuracy_reward/std": 0.33324605226516724,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 124
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1784.0,
      "completions/max_terminated_length": 1784.0,
      "completions/mean_length": 692.984375,
      "completions/mean_terminated_length": 692.984375,
      "completions/min_length": 215.0,
      "completions/min_terminated_length": 215.0,
      "epoch": 0.2442897276169537,
      "grad_norm": 0.24579712120050667,
      "learning_rate": 1e-06,
      "loss": 0.0078,
      "num_tokens": 48693200.0,
      "reward": 0.576171875,
      "reward_std": 0.16433526575565338,
      "rewards/accuracy_reward/mean": 0.076171875,
      "rewards/accuracy_reward/std": 0.26553234457969666,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 125
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4677.0,
      "completions/max_terminated_length": 4677.0,
      "completions/mean_length": 724.22265625,
      "completions/mean_terminated_length": 724.22265625,
      "completions/min_length": 250.0,
      "completions/min_terminated_length": 250.0,
      "epoch": 0.24624404543788933,
      "grad_norm": 0.27750163663134275,
      "learning_rate": 1e-06,
      "loss": 0.0148,
      "num_tokens": 49137042.0,
      "reward": 0.591796875,
      "reward_std": 0.16855724155902863,
      "rewards/accuracy_reward/mean": 0.09375,
      "rewards/accuracy_reward/std": 0.29176566004753113,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 126
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4488.0,
      "completions/max_terminated_length": 4488.0,
      "completions/mean_length": 687.873046875,
      "completions/mean_terminated_length": 687.873046875,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "epoch": 0.24819836325882497,
      "grad_norm": 0.326471684770648,
      "learning_rate": 1e-06,
      "loss": 0.007,
      "num_tokens": 49558257.0,
      "reward": 0.650390625,
      "reward_std": 0.2361946851015091,
      "rewards/accuracy_reward/mean": 0.15234375,
      "rewards/accuracy_reward/std": 0.35970520973205566,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 127
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2412.0,
      "completions/max_terminated_length": 2412.0,
      "completions/mean_length": 722.185546875,
      "completions/mean_terminated_length": 722.185546875,
      "completions/min_length": 270.0,
      "completions/min_terminated_length": 270.0,
      "epoch": 0.2501526810797606,
      "grad_norm": 0.2945388229120416,
      "learning_rate": 1e-06,
      "loss": -0.0015,
      "num_tokens": 49996592.0,
      "reward": 0.623046875,
      "reward_std": 0.1910872906446457,
      "rewards/accuracy_reward/mean": 0.125,
      "rewards/accuracy_reward/std": 0.3310423493385315,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 128
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4334.0,
      "completions/max_terminated_length": 4334.0,
      "completions/mean_length": 736.427734375,
      "completions/mean_terminated_length": 736.427734375,
      "completions/min_length": 261.0,
      "completions/min_terminated_length": 261.0,
      "epoch": 0.2521069989006962,
      "grad_norm": 0.27809340109025193,
      "learning_rate": 1e-06,
      "loss": 0.0175,
      "num_tokens": 50447643.0,
      "reward": 0.66796875,
      "reward_std": 0.20906388759613037,
      "rewards/accuracy_reward/mean": 0.169921875,
      "rewards/accuracy_reward/std": 0.3759314715862274,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 129
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4486.0,
      "completions/max_terminated_length": 4486.0,
      "completions/mean_length": 697.94140625,
      "completions/mean_terminated_length": 697.94140625,
      "completions/min_length": 92.0,
      "completions/min_terminated_length": 92.0,
      "epoch": 0.25406131672163185,
      "grad_norm": 0.35023314790448384,
      "learning_rate": 1e-06,
      "loss": 0.0034,
      "num_tokens": 50880269.0,
      "reward": 0.6220703125,
      "reward_std": 0.23224018514156342,
      "rewards/accuracy_reward/mean": 0.123046875,
      "rewards/accuracy_reward/std": 0.32881227135658264,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 130
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2300.0,
      "completions/max_terminated_length": 2300.0,
      "completions/mean_length": 688.73828125,
      "completions/mean_terminated_length": 688.73828125,
      "completions/min_length": 202.0,
      "completions/min_terminated_length": 202.0,
      "epoch": 0.2560156345425675,
      "grad_norm": 0.2927254570333304,
      "learning_rate": 1e-06,
      "loss": -0.0034,
      "num_tokens": 51313479.0,
      "reward": 0.568359375,
      "reward_std": 0.1609431505203247,
      "rewards/accuracy_reward/mean": 0.068359375,
      "rewards/accuracy_reward/std": 0.25260838866233826,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 131
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4300.0,
      "completions/max_terminated_length": 4300.0,
      "completions/mean_length": 718.095703125,
      "completions/mean_terminated_length": 718.095703125,
      "completions/min_length": 270.0,
      "completions/min_terminated_length": 270.0,
      "epoch": 0.2579699523635031,
      "grad_norm": 0.48962046110997975,
      "learning_rate": 1e-06,
      "loss": 0.01,
      "num_tokens": 51744200.0,
      "reward": 0.6240234375,
      "reward_std": 0.23233550786972046,
      "rewards/accuracy_reward/mean": 0.126953125,
      "rewards/accuracy_reward/std": 0.33324605226516724,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.994140625,
      "rewards/soft_format_reward/std": 0.07639661431312561,
      "step": 132
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4426.0,
      "completions/max_terminated_length": 4426.0,
      "completions/mean_length": 681.51953125,
      "completions/mean_terminated_length": 681.51953125,
      "completions/min_length": 182.0,
      "completions/min_terminated_length": 182.0,
      "epoch": 0.25992427018443875,
      "grad_norm": 0.2766999722832949,
      "learning_rate": 1e-06,
      "loss": -0.0036,
      "num_tokens": 52156018.0,
      "reward": 0.6220703125,
      "reward_std": 0.22395655512809753,
      "rewards/accuracy_reward/mean": 0.123046875,
      "rewards/accuracy_reward/std": 0.32881227135658264,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 133
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2029.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 741.916015625,
      "completions/mean_terminated_length": 741.916015625,
      "completions/min_length": 315.0,
      "completions/min_terminated_length": 315.0,
      "epoch": 0.2618785880053744,
      "grad_norm": 0.43387079617293317,
      "learning_rate": 1e-06,
      "loss": 0.0064,
      "num_tokens": 52618487.0,
      "reward": 0.677734375,
      "reward_std": 0.23530298471450806,
      "rewards/accuracy_reward/mean": 0.177734375,
      "rewards/accuracy_reward/std": 0.3826628625392914,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 134
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4471.0,
      "completions/max_terminated_length": 4471.0,
      "completions/mean_length": 744.962890625,
      "completions/mean_terminated_length": 744.962890625,
      "completions/min_length": 287.0,
      "completions/min_terminated_length": 287.0,
      "epoch": 0.26383290582631,
      "grad_norm": 1.3507532705943417,
      "learning_rate": 1e-06,
      "loss": 0.0209,
      "num_tokens": 53062372.0,
      "reward": 0.5986328125,
      "reward_std": 0.2160569280385971,
      "rewards/accuracy_reward/mean": 0.099609375,
      "rewards/accuracy_reward/std": 0.29977133870124817,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 135
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2466.0,
      "completions/max_terminated_length": 2466.0,
      "completions/mean_length": 608.060546875,
      "completions/mean_terminated_length": 608.060546875,
      "completions/min_length": 83.0,
      "completions/min_terminated_length": 83.0,
      "epoch": 0.2657872236472456,
      "grad_norm": 0.6982954043889902,
      "learning_rate": 1e-06,
      "loss": 0.0075,
      "num_tokens": 53440003.0,
      "reward": 0.681640625,
      "reward_std": 0.2599703073501587,
      "rewards/accuracy_reward/mean": 0.181640625,
      "rewards/accuracy_reward/std": 0.38592514395713806,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 136
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3369.0,
      "completions/max_terminated_length": 3369.0,
      "completions/mean_length": 623.02734375,
      "completions/mean_terminated_length": 623.02734375,
      "completions/min_length": 98.0,
      "completions/min_terminated_length": 98.0,
      "epoch": 0.26774154146818124,
      "grad_norm": 0.40073052595287706,
      "learning_rate": 1e-06,
      "loss": -0.0001,
      "num_tokens": 53821105.0,
      "reward": 0.7041015625,
      "reward_std": 0.29975640773773193,
      "rewards/accuracy_reward/mean": 0.205078125,
      "rewards/accuracy_reward/std": 0.4041535556316376,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 137
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1811.0,
      "completions/max_terminated_length": 1811.0,
      "completions/mean_length": 683.2265625,
      "completions/mean_terminated_length": 683.2265625,
      "completions/min_length": 216.0,
      "completions/min_terminated_length": 216.0,
      "epoch": 0.2696958592891169,
      "grad_norm": 0.3030102675188629,
      "learning_rate": 1e-06,
      "loss": 0.0031,
      "num_tokens": 54239397.0,
      "reward": 0.611328125,
      "reward_std": 0.20015983283519745,
      "rewards/accuracy_reward/mean": 0.111328125,
      "rewards/accuracy_reward/std": 0.31484565138816833,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 138
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1362.0,
      "completions/max_terminated_length": 1362.0,
      "completions/mean_length": 587.904296875,
      "completions/mean_terminated_length": 587.904296875,
      "completions/min_length": 231.0,
      "completions/min_terminated_length": 231.0,
      "epoch": 0.2716501771100525,
      "grad_norm": 0.3629849326417865,
      "learning_rate": 1e-06,
      "loss": -0.0025,
      "num_tokens": 54601220.0,
      "reward": 0.71484375,
      "reward_std": 0.25802797079086304,
      "rewards/accuracy_reward/mean": 0.21484375,
      "rewards/accuracy_reward/std": 0.4111155867576599,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 139
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1554.0,
      "completions/max_terminated_length": 1554.0,
      "completions/mean_length": 633.720703125,
      "completions/mean_terminated_length": 633.720703125,
      "completions/min_length": 238.0,
      "completions/min_terminated_length": 238.0,
      "epoch": 0.27360449493098815,
      "grad_norm": 0.34569759966181524,
      "learning_rate": 1e-06,
      "loss": -0.0014,
      "num_tokens": 54989189.0,
      "reward": 0.640625,
      "reward_std": 0.23244047164916992,
      "rewards/accuracy_reward/mean": 0.140625,
      "rewards/accuracy_reward/std": 0.3479743003845215,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 140
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4287.0,
      "completions/max_terminated_length": 4287.0,
      "completions/mean_length": 739.5390625,
      "completions/mean_terminated_length": 739.5390625,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "epoch": 0.2755588127519238,
      "grad_norm": 0.28818740456046005,
      "learning_rate": 1e-06,
      "loss": 0.0135,
      "num_tokens": 55429081.0,
      "reward": 0.6943359375,
      "reward_std": 0.22556185722351074,
      "rewards/accuracy_reward/mean": 0.1953125,
      "rewards/accuracy_reward/std": 0.3968288004398346,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 141
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3252.0,
      "completions/max_terminated_length": 3252.0,
      "completions/mean_length": 691.375,
      "completions/mean_terminated_length": 691.375,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "epoch": 0.2775131305728594,
      "grad_norm": 0.272315442375954,
      "learning_rate": 1e-06,
      "loss": -0.0083,
      "num_tokens": 55845545.0,
      "reward": 0.6103515625,
      "reward_std": 0.16950386762619019,
      "rewards/accuracy_reward/mean": 0.115234375,
      "rewards/accuracy_reward/std": 0.3196168541908264,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.990234375,
      "rewards/soft_format_reward/std": 0.09843364357948303,
      "step": 142
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3698.0,
      "completions/max_terminated_length": 3698.0,
      "completions/mean_length": 713.0546875,
      "completions/mean_terminated_length": 713.0546875,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "epoch": 0.27946744839379506,
      "grad_norm": 0.2632114845336422,
      "learning_rate": 1e-06,
      "loss": 0.0174,
      "num_tokens": 56284789.0,
      "reward": 0.6494140625,
      "reward_std": 0.17788583040237427,
      "rewards/accuracy_reward/mean": 0.15234375,
      "rewards/accuracy_reward/std": 0.35970520973205566,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.994140625,
      "rewards/soft_format_reward/std": 0.07639661431312561,
      "step": 143
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2070.0,
      "completions/max_terminated_length": 2070.0,
      "completions/mean_length": 679.162109375,
      "completions/mean_terminated_length": 679.162109375,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "epoch": 0.2814217662147307,
      "grad_norm": 0.29020462850988193,
      "learning_rate": 1e-06,
      "loss": -0.0039,
      "num_tokens": 56700440.0,
      "reward": 0.62890625,
      "reward_std": 0.11030054092407227,
      "rewards/accuracy_reward/mean": 0.130859375,
      "rewards/accuracy_reward/std": 0.33757632970809937,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 144
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2492.0,
      "completions/max_terminated_length": 2492.0,
      "completions/mean_length": 759.43359375,
      "completions/mean_terminated_length": 759.43359375,
      "completions/min_length": 198.0,
      "completions/min_terminated_length": 198.0,
      "epoch": 0.2833760840356663,
      "grad_norm": 0.17974918127783202,
      "learning_rate": 1e-06,
      "loss": 0.0082,
      "num_tokens": 57159014.0,
      "reward": 0.56640625,
      "reward_std": 0.12247820198535919,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 145
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2610.0,
      "completions/max_terminated_length": 2610.0,
      "completions/mean_length": 692.119140625,
      "completions/mean_terminated_length": 692.119140625,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "epoch": 0.2853304018566019,
      "grad_norm": 0.2431924521653994,
      "learning_rate": 1e-06,
      "loss": -0.005,
      "num_tokens": 57580803.0,
      "reward": 0.572265625,
      "reward_std": 0.14612169563770294,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 146
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4290.0,
      "completions/max_terminated_length": 4290.0,
      "completions/mean_length": 670.27734375,
      "completions/mean_terminated_length": 670.27734375,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "epoch": 0.28728471967753755,
      "grad_norm": 0.3488174165532187,
      "learning_rate": 1e-06,
      "loss": 0.0022,
      "num_tokens": 57983521.0,
      "reward": 0.6708984375,
      "reward_std": 0.2069774866104126,
      "rewards/accuracy_reward/mean": 0.173828125,
      "rewards/accuracy_reward/std": 0.3793322443962097,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.994140625,
      "rewards/soft_format_reward/std": 0.07639661431312561,
      "step": 147
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2480.0,
      "completions/max_terminated_length": 2480.0,
      "completions/mean_length": 742.1328125,
      "completions/mean_terminated_length": 742.1328125,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "epoch": 0.2892390374984732,
      "grad_norm": 0.24031256742591312,
      "learning_rate": 1e-06,
      "loss": 0.0067,
      "num_tokens": 58428325.0,
      "reward": 0.638671875,
      "reward_std": 0.19146013259887695,
      "rewards/accuracy_reward/mean": 0.138671875,
      "rewards/accuracy_reward/std": 0.34594178199768066,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 148
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2790.0,
      "completions/max_terminated_length": 2790.0,
      "completions/mean_length": 710.646484375,
      "completions/mean_terminated_length": 710.646484375,
      "completions/min_length": 225.0,
      "completions/min_terminated_length": 225.0,
      "epoch": 0.2911933553194088,
      "grad_norm": 0.2589578742432614,
      "learning_rate": 1e-06,
      "loss": 0.0034,
      "num_tokens": 58863264.0,
      "reward": 0.689453125,
      "reward_std": 0.18030090630054474,
      "rewards/accuracy_reward/mean": 0.189453125,
      "rewards/accuracy_reward/std": 0.3922513723373413,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 149
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3338.0,
      "completions/max_terminated_length": 3338.0,
      "completions/mean_length": 689.119140625,
      "completions/mean_terminated_length": 689.119140625,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "epoch": 0.29314767314034446,
      "grad_norm": 0.31023940245693943,
      "learning_rate": 1e-06,
      "loss": 0.0231,
      "num_tokens": 59282013.0,
      "reward": 0.6826171875,
      "reward_std": 0.19207939505577087,
      "rewards/accuracy_reward/mean": 0.18359375,
      "rewards/accuracy_reward/std": 0.3875311613082886,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 150
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4099.0,
      "completions/max_terminated_length": 4099.0,
      "completions/mean_length": 815.75,
      "completions/mean_terminated_length": 815.75,
      "completions/min_length": 62.0,
      "completions/min_terminated_length": 62.0,
      "epoch": 0.2951019909612801,
      "grad_norm": 0.24229591833709058,
      "learning_rate": 1e-06,
      "loss": 0.0128,
      "num_tokens": 59768989.0,
      "reward": 0.69921875,
      "reward_std": 0.2111576944589615,
      "rewards/accuracy_reward/mean": 0.203125,
      "rewards/accuracy_reward/std": 0.4027182459831238,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.9921875,
      "rewards/soft_format_reward/std": 0.08812850713729858,
      "step": 151
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4529.0,
      "completions/max_terminated_length": 4529.0,
      "completions/mean_length": 770.59765625,
      "completions/mean_terminated_length": 770.59765625,
      "completions/min_length": 200.0,
      "completions/min_terminated_length": 200.0,
      "epoch": 0.29705630878221573,
      "grad_norm": 0.23041197635858718,
      "learning_rate": 1e-06,
      "loss": 0.0168,
      "num_tokens": 60246655.0,
      "reward": 0.669921875,
      "reward_std": 0.14793451130390167,
      "rewards/accuracy_reward/mean": 0.171875,
      "rewards/accuracy_reward/std": 0.3776407241821289,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 152
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3552.0,
      "completions/max_terminated_length": 3552.0,
      "completions/mean_length": 802.4453125,
      "completions/mean_terminated_length": 802.4453125,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "epoch": 0.29901062660315136,
      "grad_norm": 0.23297876350421173,
      "learning_rate": 1e-06,
      "loss": 0.0186,
      "num_tokens": 60737779.0,
      "reward": 0.6806640625,
      "reward_std": 0.14777296781539917,
      "rewards/accuracy_reward/mean": 0.181640625,
      "rewards/accuracy_reward/std": 0.38592514395713806,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 153
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1844.0,
      "completions/max_terminated_length": 1844.0,
      "completions/mean_length": 758.3046875,
      "completions/mean_terminated_length": 758.3046875,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "epoch": 0.30096494442408694,
      "grad_norm": 0.26254209084339436,
      "learning_rate": 1e-06,
      "loss": -0.0081,
      "num_tokens": 61212575.0,
      "reward": 0.65234375,
      "reward_std": 0.17155036330223083,
      "rewards/accuracy_reward/mean": 0.15234375,
      "rewards/accuracy_reward/std": 0.35970520973205566,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 154
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1906.0,
      "completions/max_terminated_length": 1906.0,
      "completions/mean_length": 744.447265625,
      "completions/mean_terminated_length": 744.447265625,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "epoch": 0.3029192622450226,
      "grad_norm": 0.20358568099423952,
      "learning_rate": 1e-06,
      "loss": 0.0079,
      "num_tokens": 61662388.0,
      "reward": 0.56640625,
      "reward_std": 0.08109388500452042,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 155
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1819.0,
      "completions/max_terminated_length": 1819.0,
      "completions/mean_length": 811.734375,
      "completions/mean_terminated_length": 811.734375,
      "completions/min_length": 242.0,
      "completions/min_terminated_length": 242.0,
      "epoch": 0.3048735800659582,
      "grad_norm": 0.21664905219307753,
      "learning_rate": 1e-06,
      "loss": 0.0031,
      "num_tokens": 62164380.0,
      "reward": 0.6484375,
      "reward_std": 0.1361752450466156,
      "rewards/accuracy_reward/mean": 0.1484375,
      "rewards/accuracy_reward/std": 0.35588082671165466,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 156
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2043.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 683.7109375,
      "completions/mean_terminated_length": 683.7109375,
      "completions/min_length": 192.0,
      "completions/min_terminated_length": 192.0,
      "epoch": 0.30682789788689385,
      "grad_norm": 0.37650908881308803,
      "learning_rate": 1e-06,
      "loss": 0.0199,
      "num_tokens": 62618920.0,
      "reward": 0.767578125,
      "reward_std": 0.2620534896850586,
      "rewards/accuracy_reward/mean": 0.267578125,
      "rewards/accuracy_reward/std": 0.4431293308734894,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 157
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2904.0,
      "completions/max_terminated_length": 2904.0,
      "completions/mean_length": 752.37890625,
      "completions/mean_terminated_length": 752.37890625,
      "completions/min_length": 189.0,
      "completions/min_terminated_length": 189.0,
      "epoch": 0.3087822157078295,
      "grad_norm": 0.24518760312034588,
      "learning_rate": 1e-06,
      "loss": 0.0146,
      "num_tokens": 63077690.0,
      "reward": 0.66796875,
      "reward_std": 0.15783661603927612,
      "rewards/accuracy_reward/mean": 0.16796875,
      "rewards/accuracy_reward/std": 0.374204158782959,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 158
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1666.0,
      "completions/max_terminated_length": 1666.0,
      "completions/mean_length": 748.009765625,
      "completions/mean_terminated_length": 748.009765625,
      "completions/min_length": 269.0,
      "completions/min_terminated_length": 269.0,
      "epoch": 0.3107365335287651,
      "grad_norm": 0.20605205942354404,
      "learning_rate": 1e-06,
      "loss": -0.0096,
      "num_tokens": 63527839.0,
      "reward": 0.62109375,
      "reward_std": 0.17718049883842468,
      "rewards/accuracy_reward/mean": 0.12109375,
      "rewards/accuracy_reward/std": 0.3265552520751953,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 159
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2589.0,
      "completions/max_terminated_length": 2589.0,
      "completions/mean_length": 747.109375,
      "completions/mean_terminated_length": 747.109375,
      "completions/min_length": 337.0,
      "completions/min_terminated_length": 337.0,
      "epoch": 0.31269085134970076,
      "grad_norm": 0.2679095070290706,
      "learning_rate": 1e-06,
      "loss": 0.0067,
      "num_tokens": 63975655.0,
      "reward": 0.6484375,
      "reward_std": 0.24407950043678284,
      "rewards/accuracy_reward/mean": 0.1484375,
      "rewards/accuracy_reward/std": 0.35588082671165466,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 160
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1827.0,
      "completions/max_terminated_length": 1827.0,
      "completions/mean_length": 812.529296875,
      "completions/mean_terminated_length": 812.529296875,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "epoch": 0.3146451691706364,
      "grad_norm": 0.24193586431650785,
      "learning_rate": 1e-06,
      "loss": -0.0078,
      "num_tokens": 64471622.0,
      "reward": 0.6953125,
      "reward_std": 0.22490081191062927,
      "rewards/accuracy_reward/mean": 0.1953125,
      "rewards/accuracy_reward/std": 0.3968288004398346,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 161
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2423.0,
      "completions/max_terminated_length": 2423.0,
      "completions/mean_length": 854.40234375,
      "completions/mean_terminated_length": 854.40234375,
      "completions/min_length": 367.0,
      "completions/min_terminated_length": 367.0,
      "epoch": 0.316599486991572,
      "grad_norm": 0.2317553138327133,
      "learning_rate": 1e-06,
      "loss": -0.0058,
      "num_tokens": 64993460.0,
      "reward": 0.619140625,
      "reward_std": 0.16367799043655396,
      "rewards/accuracy_reward/mean": 0.119140625,
      "rewards/accuracy_reward/std": 0.32427072525024414,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 162
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 1634.0,
      "completions/max_terminated_length": 1634.0,
      "completions/mean_length": 777.09375,
      "completions/mean_terminated_length": 778.614501953125,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 260.0,
      "epoch": 0.3185538048125076,
      "grad_norm": 0.24628709207337093,
      "learning_rate": 1e-06,
      "loss": -0.0019,
      "num_tokens": 65482452.0,
      "reward": 0.7392578125,
      "reward_std": 0.24229753017425537,
      "rewards/accuracy_reward/mean": 0.240234375,
      "rewards/accuracy_reward/std": 0.4276435375213623,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 163
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2015.0,
      "completions/max_terminated_length": 2015.0,
      "completions/mean_length": 813.26171875,
      "completions/mean_terminated_length": 813.26171875,
      "completions/min_length": 307.0,
      "completions/min_terminated_length": 307.0,
      "epoch": 0.32050812263344325,
      "grad_norm": 0.2619314014933818,
      "learning_rate": 1e-06,
      "loss": 0.0072,
      "num_tokens": 65968538.0,
      "reward": 0.658203125,
      "reward_std": 0.20367062091827393,
      "rewards/accuracy_reward/mean": 0.158203125,
      "rewards/accuracy_reward/std": 0.36528825759887695,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 164
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3080.0,
      "completions/max_terminated_length": 3080.0,
      "completions/mean_length": 774.814453125,
      "completions/mean_terminated_length": 774.814453125,
      "completions/min_length": 297.0,
      "completions/min_terminated_length": 297.0,
      "epoch": 0.3224624404543789,
      "grad_norm": 0.25309168218127176,
      "learning_rate": 1e-06,
      "loss": 0.0175,
      "num_tokens": 66451115.0,
      "reward": 0.6220703125,
      "reward_std": 0.16638079285621643,
      "rewards/accuracy_reward/mean": 0.123046875,
      "rewards/accuracy_reward/std": 0.32881227135658264,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 165
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4065.0,
      "completions/max_terminated_length": 4065.0,
      "completions/mean_length": 790.060546875,
      "completions/mean_terminated_length": 790.060546875,
      "completions/min_length": 323.0,
      "completions/min_terminated_length": 323.0,
      "epoch": 0.3244167582753145,
      "grad_norm": 0.2963651499207953,
      "learning_rate": 1e-06,
      "loss": -0.0118,
      "num_tokens": 66925770.0,
      "reward": 0.74609375,
      "reward_std": 0.30336713790893555,
      "rewards/accuracy_reward/mean": 0.248046875,
      "rewards/accuracy_reward/std": 0.4323015511035919,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 166
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2770.0,
      "completions/max_terminated_length": 2770.0,
      "completions/mean_length": 764.955078125,
      "completions/mean_terminated_length": 764.955078125,
      "completions/min_length": 302.0,
      "completions/min_terminated_length": 302.0,
      "epoch": 0.32637107609625016,
      "grad_norm": 0.3205316675698082,
      "learning_rate": 1e-06,
      "loss": -0.0049,
      "num_tokens": 67382307.0,
      "reward": 0.736328125,
      "reward_std": 0.35904234647750854,
      "rewards/accuracy_reward/mean": 0.23828125,
      "rewards/accuracy_reward/std": 0.42644867300987244,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 167
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3767.0,
      "completions/max_terminated_length": 3767.0,
      "completions/mean_length": 780.142578125,
      "completions/mean_terminated_length": 780.142578125,
      "completions/min_length": 228.0,
      "completions/min_terminated_length": 228.0,
      "epoch": 0.3283253939171858,
      "grad_norm": 0.31422935388243467,
      "learning_rate": 1e-06,
      "loss": 0.0152,
      "num_tokens": 67847356.0,
      "reward": 0.7294921875,
      "reward_std": 0.2578871250152588,
      "rewards/accuracy_reward/mean": 0.232421875,
      "rewards/accuracy_reward/std": 0.42278963327407837,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.994140625,
      "rewards/soft_format_reward/std": 0.07639661431312561,
      "step": 168
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3635.0,
      "completions/max_terminated_length": 3635.0,
      "completions/mean_length": 789.384765625,
      "completions/mean_terminated_length": 789.384765625,
      "completions/min_length": 362.0,
      "completions/min_terminated_length": 362.0,
      "epoch": 0.33027971173812143,
      "grad_norm": 0.2763591871083539,
      "learning_rate": 1e-06,
      "loss": 0.0166,
      "num_tokens": 68315777.0,
      "reward": 0.6640625,
      "reward_std": 0.2283334732055664,
      "rewards/accuracy_reward/mean": 0.166015625,
      "rewards/accuracy_reward/std": 0.3724585771560669,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 169
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3003.0,
      "completions/max_terminated_length": 3003.0,
      "completions/mean_length": 746.04296875,
      "completions/mean_terminated_length": 746.04296875,
      "completions/min_length": 274.0,
      "completions/min_terminated_length": 274.0,
      "epoch": 0.33223402955905706,
      "grad_norm": 0.22889079270457433,
      "learning_rate": 1e-06,
      "loss": -0.0004,
      "num_tokens": 68784375.0,
      "reward": 0.6220703125,
      "reward_std": 0.15298479795455933,
      "rewards/accuracy_reward/mean": 0.123046875,
      "rewards/accuracy_reward/std": 0.32881227135658264,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 170
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2324.0,
      "completions/max_terminated_length": 2324.0,
      "completions/mean_length": 740.236328125,
      "completions/mean_terminated_length": 740.236328125,
      "completions/min_length": 247.0,
      "completions/min_terminated_length": 247.0,
      "epoch": 0.33418834737999265,
      "grad_norm": 0.2400226594585389,
      "learning_rate": 1e-06,
      "loss": 0.0032,
      "num_tokens": 69230560.0,
      "reward": 0.623046875,
      "reward_std": 0.13837136328220367,
      "rewards/accuracy_reward/mean": 0.123046875,
      "rewards/accuracy_reward/std": 0.32881227135658264,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 171
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2893.0,
      "completions/max_terminated_length": 2893.0,
      "completions/mean_length": 798.46484375,
      "completions/mean_terminated_length": 798.46484375,
      "completions/min_length": 313.0,
      "completions/min_terminated_length": 313.0,
      "epoch": 0.3361426652009283,
      "grad_norm": 0.21283379297888078,
      "learning_rate": 1e-06,
      "loss": 0.0069,
      "num_tokens": 69712606.0,
      "reward": 0.6533203125,
      "reward_std": 0.13678552210330963,
      "rewards/accuracy_reward/mean": 0.154296875,
      "rewards/accuracy_reward/std": 0.36158639192581177,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 172
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2525.0,
      "completions/max_terminated_length": 2525.0,
      "completions/mean_length": 781.80859375,
      "completions/mean_terminated_length": 781.80859375,
      "completions/min_length": 280.0,
      "completions/min_terminated_length": 280.0,
      "epoch": 0.3380969830218639,
      "grad_norm": 0.24719869366609729,
      "learning_rate": 1e-06,
      "loss": 0.0016,
      "num_tokens": 70180108.0,
      "reward": 0.650390625,
      "reward_std": 0.2241959273815155,
      "rewards/accuracy_reward/mean": 0.150390625,
      "rewards/accuracy_reward/std": 0.35780346393585205,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 173
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4074.0,
      "completions/max_terminated_length": 4074.0,
      "completions/mean_length": 748.6640625,
      "completions/mean_terminated_length": 748.6640625,
      "completions/min_length": 274.0,
      "completions/min_terminated_length": 274.0,
      "epoch": 0.34005130084279955,
      "grad_norm": 0.25610581415426187,
      "learning_rate": 1e-06,
      "loss": 0.0139,
      "num_tokens": 70632432.0,
      "reward": 0.634765625,
      "reward_std": 0.16960762441158295,
      "rewards/accuracy_reward/mean": 0.13671875,
      "rewards/accuracy_reward/std": 0.3438861668109894,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 174
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3302.0,
      "completions/max_terminated_length": 3302.0,
      "completions/mean_length": 745.935546875,
      "completions/mean_terminated_length": 745.935546875,
      "completions/min_length": 287.0,
      "completions/min_terminated_length": 287.0,
      "epoch": 0.3420056186637352,
      "grad_norm": 0.26869305689085277,
      "learning_rate": 1e-06,
      "loss": -0.0012,
      "num_tokens": 71082911.0,
      "reward": 0.646484375,
      "reward_std": 0.1600971221923828,
      "rewards/accuracy_reward/mean": 0.1484375,
      "rewards/accuracy_reward/std": 0.35588082671165466,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 175
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1497.0,
      "completions/max_terminated_length": 1497.0,
      "completions/mean_length": 757.15625,
      "completions/mean_terminated_length": 757.15625,
      "completions/min_length": 351.0,
      "completions/min_terminated_length": 351.0,
      "epoch": 0.3439599364846708,
      "grad_norm": 0.25912471509203233,
      "learning_rate": 1e-06,
      "loss": 0.0048,
      "num_tokens": 71547023.0,
      "reward": 0.685546875,
      "reward_std": 0.18943729996681213,
      "rewards/accuracy_reward/mean": 0.185546875,
      "rewards/accuracy_reward/std": 0.38912075757980347,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 176
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4423.0,
      "completions/max_terminated_length": 4423.0,
      "completions/mean_length": 845.38671875,
      "completions/mean_terminated_length": 845.38671875,
      "completions/min_length": 323.0,
      "completions/min_terminated_length": 323.0,
      "epoch": 0.34591425430560646,
      "grad_norm": 0.27027972145066304,
      "learning_rate": 1e-06,
      "loss": 0.0096,
      "num_tokens": 72061349.0,
      "reward": 0.6474609375,
      "reward_std": 0.21319273114204407,
      "rewards/accuracy_reward/mean": 0.150390625,
      "rewards/accuracy_reward/std": 0.35780346393585205,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.994140625,
      "rewards/soft_format_reward/std": 0.07639661431312561,
      "step": 177
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2395.0,
      "completions/max_terminated_length": 2395.0,
      "completions/mean_length": 712.73046875,
      "completions/mean_terminated_length": 712.73046875,
      "completions/min_length": 175.0,
      "completions/min_terminated_length": 175.0,
      "epoch": 0.3478685721265421,
      "grad_norm": 0.3427869494899167,
      "learning_rate": 1e-06,
      "loss": -0.0083,
      "num_tokens": 72500203.0,
      "reward": 0.7109375,
      "reward_std": 0.2330913245677948,
      "rewards/accuracy_reward/mean": 0.2109375,
      "rewards/accuracy_reward/std": 0.4083731174468994,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 178
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4155.0,
      "completions/max_terminated_length": 4155.0,
      "completions/mean_length": 799.75,
      "completions/mean_terminated_length": 799.75,
      "completions/min_length": 399.0,
      "completions/min_terminated_length": 399.0,
      "epoch": 0.34982288994747773,
      "grad_norm": 0.25141299436628006,
      "learning_rate": 1e-06,
      "loss": 0.0092,
      "num_tokens": 72986843.0,
      "reward": 0.6064453125,
      "reward_std": 0.1697743833065033,
      "rewards/accuracy_reward/mean": 0.111328125,
      "rewards/accuracy_reward/std": 0.31484565138816833,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.990234375,
      "rewards/soft_format_reward/std": 0.09843364357948303,
      "step": 179
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3389.0,
      "completions/max_terminated_length": 3389.0,
      "completions/mean_length": 779.1171875,
      "completions/mean_terminated_length": 779.1171875,
      "completions/min_length": 237.0,
      "completions/min_terminated_length": 237.0,
      "epoch": 0.3517772077684133,
      "grad_norm": 0.29349116949044435,
      "learning_rate": 1e-06,
      "loss": -0.0031,
      "num_tokens": 73451527.0,
      "reward": 0.720703125,
      "reward_std": 0.26464205980300903,
      "rewards/accuracy_reward/mean": 0.220703125,
      "rewards/accuracy_reward/std": 0.4151262938976288,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 180
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3525.0,
      "completions/max_terminated_length": 3525.0,
      "completions/mean_length": 739.443359375,
      "completions/mean_terminated_length": 739.443359375,
      "completions/min_length": 338.0,
      "completions/min_terminated_length": 338.0,
      "epoch": 0.35373152558934895,
      "grad_norm": 0.3240617719772287,
      "learning_rate": 1e-06,
      "loss": 0.0163,
      "num_tokens": 73910714.0,
      "reward": 0.802734375,
      "reward_std": 0.31746405363082886,
      "rewards/accuracy_reward/mean": 0.3046875,
      "rewards/accuracy_reward/std": 0.4607250988483429,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 181
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3310.0,
      "completions/max_terminated_length": 3310.0,
      "completions/mean_length": 804.076171875,
      "completions/mean_terminated_length": 804.076171875,
      "completions/min_length": 315.0,
      "completions/min_terminated_length": 315.0,
      "epoch": 0.3556858434102846,
      "grad_norm": 0.23830613092555628,
      "learning_rate": 1e-06,
      "loss": 0.005,
      "num_tokens": 74379601.0,
      "reward": 0.623046875,
      "reward_std": 0.19262534379959106,
      "rewards/accuracy_reward/mean": 0.123046875,
      "rewards/accuracy_reward/std": 0.32881227135658264,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 182
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2351.0,
      "completions/max_terminated_length": 2351.0,
      "completions/mean_length": 851.955078125,
      "completions/mean_terminated_length": 851.955078125,
      "completions/min_length": 349.0,
      "completions/min_terminated_length": 349.0,
      "epoch": 0.3576401612312202,
      "grad_norm": 0.26190342676179346,
      "learning_rate": 1e-06,
      "loss": 0.0073,
      "num_tokens": 74893274.0,
      "reward": 0.75,
      "reward_std": 0.22990617156028748,
      "rewards/accuracy_reward/mean": 0.25,
      "rewards/accuracy_reward/std": 0.43343618512153625,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 183
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2327.0,
      "completions/max_terminated_length": 2327.0,
      "completions/mean_length": 862.33984375,
      "completions/mean_terminated_length": 862.33984375,
      "completions/min_length": 191.0,
      "completions/min_terminated_length": 191.0,
      "epoch": 0.35959447905215586,
      "grad_norm": 0.2632879398895767,
      "learning_rate": 1e-06,
      "loss": 0.0137,
      "num_tokens": 75407912.0,
      "reward": 0.638671875,
      "reward_std": 0.17609556019306183,
      "rewards/accuracy_reward/mean": 0.138671875,
      "rewards/accuracy_reward/std": 0.34594178199768066,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 184
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2741.0,
      "completions/max_terminated_length": 2741.0,
      "completions/mean_length": 820.552734375,
      "completions/mean_terminated_length": 820.552734375,
      "completions/min_length": 338.0,
      "completions/min_terminated_length": 338.0,
      "epoch": 0.3615487968730915,
      "grad_norm": 0.2863485310277067,
      "learning_rate": 1e-06,
      "loss": 0.0058,
      "num_tokens": 75898611.0,
      "reward": 0.7080078125,
      "reward_std": 0.25882139801979065,
      "rewards/accuracy_reward/mean": 0.208984375,
      "rewards/accuracy_reward/std": 0.40698084235191345,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 185
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4223.0,
      "completions/max_terminated_length": 4223.0,
      "completions/mean_length": 814.423828125,
      "completions/mean_terminated_length": 814.423828125,
      "completions/min_length": 288.0,
      "completions/min_terminated_length": 288.0,
      "epoch": 0.36350311469402713,
      "grad_norm": 0.2622039048191849,
      "learning_rate": 1e-06,
      "loss": -0.0053,
      "num_tokens": 76386588.0,
      "reward": 0.5869140625,
      "reward_std": 0.18172740936279297,
      "rewards/accuracy_reward/mean": 0.087890625,
      "rewards/accuracy_reward/std": 0.2834126651287079,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 186
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1821.0,
      "completions/max_terminated_length": 1821.0,
      "completions/mean_length": 829.587890625,
      "completions/mean_terminated_length": 829.587890625,
      "completions/min_length": 341.0,
      "completions/min_terminated_length": 341.0,
      "epoch": 0.36545743251496277,
      "grad_norm": 0.2732455139111148,
      "learning_rate": 1e-06,
      "loss": 0.0022,
      "num_tokens": 76886505.0,
      "reward": 0.689453125,
      "reward_std": 0.23466821014881134,
      "rewards/accuracy_reward/mean": 0.189453125,
      "rewards/accuracy_reward/std": 0.3922513723373413,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 187
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4208.0,
      "completions/max_terminated_length": 4208.0,
      "completions/mean_length": 856.87109375,
      "completions/mean_terminated_length": 856.87109375,
      "completions/min_length": 194.0,
      "completions/min_terminated_length": 194.0,
      "epoch": 0.3674117503358984,
      "grad_norm": 0.2623055000011323,
      "learning_rate": 1e-06,
      "loss": 0.0129,
      "num_tokens": 77400583.0,
      "reward": 0.7314453125,
      "reward_std": 0.2852405905723572,
      "rewards/accuracy_reward/mean": 0.232421875,
      "rewards/accuracy_reward/std": 0.42278963327407837,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 188
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5640.0,
      "completions/max_terminated_length": 5640.0,
      "completions/mean_length": 853.30859375,
      "completions/mean_terminated_length": 853.30859375,
      "completions/min_length": 209.0,
      "completions/min_terminated_length": 209.0,
      "epoch": 0.369366068156834,
      "grad_norm": 0.3573629765618696,
      "learning_rate": 1e-06,
      "loss": 0.0153,
      "num_tokens": 77902117.0,
      "reward": 0.7265625,
      "reward_std": 0.2968667149543762,
      "rewards/accuracy_reward/mean": 0.234375,
      "rewards/accuracy_reward/std": 0.42402184009552,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.984375,
      "rewards/soft_format_reward/std": 0.12414088100194931,
      "step": 189
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3075.0,
      "completions/max_terminated_length": 3075.0,
      "completions/mean_length": 899.916015625,
      "completions/mean_terminated_length": 899.916015625,
      "completions/min_length": 359.0,
      "completions/min_terminated_length": 359.0,
      "epoch": 0.3713203859777696,
      "grad_norm": 0.19173275081974,
      "learning_rate": 1e-06,
      "loss": 0.0079,
      "num_tokens": 78430490.0,
      "reward": 0.685546875,
      "reward_std": 0.1747143268585205,
      "rewards/accuracy_reward/mean": 0.1875,
      "rewards/accuracy_reward/std": 0.39069411158561707,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 190
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3782.0,
      "completions/max_terminated_length": 3782.0,
      "completions/mean_length": 955.080078125,
      "completions/mean_terminated_length": 955.080078125,
      "completions/min_length": 333.0,
      "completions/min_terminated_length": 333.0,
      "epoch": 0.37327470379870525,
      "grad_norm": 0.23793975238468773,
      "learning_rate": 1e-06,
      "loss": 0.0299,
      "num_tokens": 78984067.0,
      "reward": 0.6748046875,
      "reward_std": 0.2005995362997055,
      "rewards/accuracy_reward/mean": 0.177734375,
      "rewards/accuracy_reward/std": 0.3826628625392914,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.994140625,
      "rewards/soft_format_reward/std": 0.07639661431312561,
      "step": 191
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3498.0,
      "completions/max_terminated_length": 3498.0,
      "completions/mean_length": 866.150390625,
      "completions/mean_terminated_length": 866.150390625,
      "completions/min_length": 309.0,
      "completions/min_terminated_length": 309.0,
      "epoch": 0.3752290216196409,
      "grad_norm": 0.31379268532281196,
      "learning_rate": 1e-06,
      "loss": 0.0032,
      "num_tokens": 79499888.0,
      "reward": 0.724609375,
      "reward_std": 0.19679194688796997,
      "rewards/accuracy_reward/mean": 0.228515625,
      "rewards/accuracy_reward/std": 0.4202871024608612,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.9921875,
      "rewards/soft_format_reward/std": 0.08812850713729858,
      "step": 192
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2597.0,
      "completions/max_terminated_length": 2597.0,
      "completions/mean_length": 878.51953125,
      "completions/mean_terminated_length": 878.51953125,
      "completions/min_length": 330.0,
      "completions/min_terminated_length": 330.0,
      "epoch": 0.3771833394405765,
      "grad_norm": 0.24394454897717519,
      "learning_rate": 1e-06,
      "loss": 0.0042,
      "num_tokens": 80015866.0,
      "reward": 0.6806640625,
      "reward_std": 0.18255063891410828,
      "rewards/accuracy_reward/mean": 0.181640625,
      "rewards/accuracy_reward/std": 0.38592514395713806,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 193
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3530.0,
      "completions/max_terminated_length": 3530.0,
      "completions/mean_length": 909.580078125,
      "completions/mean_terminated_length": 909.580078125,
      "completions/min_length": 320.0,
      "completions/min_terminated_length": 320.0,
      "epoch": 0.37913765726151216,
      "grad_norm": 0.18040807657100863,
      "learning_rate": 1e-06,
      "loss": 0.0072,
      "num_tokens": 80542451.0,
      "reward": 0.626953125,
      "reward_std": 0.13550108671188354,
      "rewards/accuracy_reward/mean": 0.12890625,
      "rewards/accuracy_reward/std": 0.33542385697364807,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 194
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3397.0,
      "completions/max_terminated_length": 3397.0,
      "completions/mean_length": 954.884765625,
      "completions/mean_terminated_length": 954.884765625,
      "completions/min_length": 303.0,
      "completions/min_terminated_length": 303.0,
      "epoch": 0.3810919750824478,
      "grad_norm": 0.22170464927593608,
      "learning_rate": 1e-06,
      "loss": 0.0059,
      "num_tokens": 81102424.0,
      "reward": 0.65234375,
      "reward_std": 0.15196657180786133,
      "rewards/accuracy_reward/mean": 0.15234375,
      "rewards/accuracy_reward/std": 0.35970520973205566,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 195
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2168.0,
      "completions/max_terminated_length": 2168.0,
      "completions/mean_length": 908.2578125,
      "completions/mean_terminated_length": 908.2578125,
      "completions/min_length": 382.0,
      "completions/min_terminated_length": 382.0,
      "epoch": 0.38304629290338343,
      "grad_norm": 0.23600614941725273,
      "learning_rate": 1e-06,
      "loss": 0.0015,
      "num_tokens": 81646428.0,
      "reward": 0.68359375,
      "reward_std": 0.20152875781059265,
      "rewards/accuracy_reward/mean": 0.18359375,
      "rewards/accuracy_reward/std": 0.3875311613082886,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 196
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4084.0,
      "completions/max_terminated_length": 4084.0,
      "completions/mean_length": 945.404296875,
      "completions/mean_terminated_length": 945.404296875,
      "completions/min_length": 359.0,
      "completions/min_terminated_length": 359.0,
      "epoch": 0.38500061072431907,
      "grad_norm": 0.21988431449537316,
      "learning_rate": 1e-06,
      "loss": 0.0215,
      "num_tokens": 82207227.0,
      "reward": 0.8193359375,
      "reward_std": 0.17495843768119812,
      "rewards/accuracy_reward/mean": 0.3203125,
      "rewards/accuracy_reward/std": 0.4670529365539551,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 197
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3764.0,
      "completions/max_terminated_length": 3764.0,
      "completions/mean_length": 931.064453125,
      "completions/mean_terminated_length": 931.064453125,
      "completions/min_length": 339.0,
      "completions/min_terminated_length": 339.0,
      "epoch": 0.38695492854525465,
      "grad_norm": 0.25995575663674936,
      "learning_rate": 1e-06,
      "loss": -0.007,
      "num_tokens": 82754828.0,
      "reward": 0.669921875,
      "reward_std": 0.21521224081516266,
      "rewards/accuracy_reward/mean": 0.169921875,
      "rewards/accuracy_reward/std": 0.3759314715862274,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 198
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1944.0,
      "completions/max_terminated_length": 1944.0,
      "completions/mean_length": 884.419921875,
      "completions/mean_terminated_length": 884.419921875,
      "completions/min_length": 310.0,
      "completions/min_terminated_length": 310.0,
      "epoch": 0.3889092463661903,
      "grad_norm": 0.26268468182790095,
      "learning_rate": 1e-06,
      "loss": 0.0074,
      "num_tokens": 83296547.0,
      "reward": 0.572265625,
      "reward_std": 0.14881780743598938,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 199
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2847.0,
      "completions/max_terminated_length": 2847.0,
      "completions/mean_length": 955.845703125,
      "completions/mean_terminated_length": 955.845703125,
      "completions/min_length": 326.0,
      "completions/min_terminated_length": 326.0,
      "epoch": 0.3908635641871259,
      "grad_norm": 0.2971001211592783,
      "learning_rate": 1e-06,
      "loss": 0.0054,
      "num_tokens": 83874180.0,
      "reward": 0.734375,
      "reward_std": 0.26208168268203735,
      "rewards/accuracy_reward/mean": 0.236328125,
      "rewards/accuracy_reward/std": 0.42524150013923645,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 200
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3982.0,
      "completions/max_terminated_length": 3982.0,
      "completions/mean_length": 915.42578125,
      "completions/mean_terminated_length": 915.42578125,
      "completions/min_length": 375.0,
      "completions/min_terminated_length": 375.0,
      "epoch": 0.39281788200806156,
      "grad_norm": 0.2612289532160619,
      "learning_rate": 1e-06,
      "loss": 0.0175,
      "num_tokens": 84414638.0,
      "reward": 0.71875,
      "reward_std": 0.22146812081336975,
      "rewards/accuracy_reward/mean": 0.220703125,
      "rewards/accuracy_reward/std": 0.4151262938976288,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 201
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2352.0,
      "completions/max_terminated_length": 2352.0,
      "completions/mean_length": 952.732421875,
      "completions/mean_terminated_length": 952.732421875,
      "completions/min_length": 366.0,
      "completions/min_terminated_length": 366.0,
      "epoch": 0.3947721998289972,
      "grad_norm": 0.21818012447386279,
      "learning_rate": 1e-06,
      "loss": -0.0053,
      "num_tokens": 84968773.0,
      "reward": 0.66015625,
      "reward_std": 0.16679157316684723,
      "rewards/accuracy_reward/mean": 0.16015625,
      "rewards/accuracy_reward/std": 0.3671095669269562,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 202
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1861.0,
      "completions/max_terminated_length": 1861.0,
      "completions/mean_length": 987.447265625,
      "completions/mean_terminated_length": 987.447265625,
      "completions/min_length": 371.0,
      "completions/min_terminated_length": 371.0,
      "epoch": 0.39672651764993283,
      "grad_norm": 0.2513367951611963,
      "learning_rate": 1e-06,
      "loss": -0.0109,
      "num_tokens": 85548490.0,
      "reward": 0.615234375,
      "reward_std": 0.1888715773820877,
      "rewards/accuracy_reward/mean": 0.115234375,
      "rewards/accuracy_reward/std": 0.3196168541908264,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 203
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2748.0,
      "completions/max_terminated_length": 2748.0,
      "completions/mean_length": 1065.8203125,
      "completions/mean_terminated_length": 1065.8203125,
      "completions/min_length": 431.0,
      "completions/min_terminated_length": 431.0,
      "epoch": 0.39868083547086847,
      "grad_norm": 0.2557963082324739,
      "learning_rate": 1e-06,
      "loss": 0.0058,
      "num_tokens": 86170270.0,
      "reward": 0.666015625,
      "reward_std": 0.23651830852031708,
      "rewards/accuracy_reward/mean": 0.166015625,
      "rewards/accuracy_reward/std": 0.3724585771560669,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 204
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4625.0,
      "completions/max_terminated_length": 4625.0,
      "completions/mean_length": 945.232421875,
      "completions/mean_terminated_length": 945.232421875,
      "completions/min_length": 230.0,
      "completions/min_terminated_length": 230.0,
      "epoch": 0.4006351532918041,
      "grad_norm": 0.30834471563398924,
      "learning_rate": 1e-06,
      "loss": 0.0106,
      "num_tokens": 86720965.0,
      "reward": 0.7470703125,
      "reward_std": 0.246811181306839,
      "rewards/accuracy_reward/mean": 0.248046875,
      "rewards/accuracy_reward/std": 0.4323015511035919,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 205
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2356.0,
      "completions/max_terminated_length": 2356.0,
      "completions/mean_length": 956.16796875,
      "completions/mean_terminated_length": 956.16796875,
      "completions/min_length": 376.0,
      "completions/min_terminated_length": 376.0,
      "epoch": 0.4025894711127397,
      "grad_norm": 0.27542187853912603,
      "learning_rate": 1e-06,
      "loss": 0.0014,
      "num_tokens": 87281467.0,
      "reward": 0.75,
      "reward_std": 0.29134687781333923,
      "rewards/accuracy_reward/mean": 0.25,
      "rewards/accuracy_reward/std": 0.43343618512153625,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 206
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4110.0,
      "completions/max_terminated_length": 4110.0,
      "completions/mean_length": 942.583984375,
      "completions/mean_terminated_length": 942.583984375,
      "completions/min_length": 429.0,
      "completions/min_terminated_length": 429.0,
      "epoch": 0.4045437889336753,
      "grad_norm": 0.30443486724056545,
      "learning_rate": 1e-06,
      "loss": 0.005,
      "num_tokens": 87835382.0,
      "reward": 0.8212890625,
      "reward_std": 0.3201446235179901,
      "rewards/accuracy_reward/mean": 0.322265625,
      "rewards/accuracy_reward/std": 0.46780112385749817,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 207
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2104.0,
      "completions/max_terminated_length": 2104.0,
      "completions/mean_length": 827.787109375,
      "completions/mean_terminated_length": 827.787109375,
      "completions/min_length": 319.0,
      "completions/min_terminated_length": 319.0,
      "epoch": 0.40649810675461095,
      "grad_norm": 0.32313505262864856,
      "learning_rate": 1e-06,
      "loss": 0.0037,
      "num_tokens": 88322185.0,
      "reward": 0.810546875,
      "reward_std": 0.25759023427963257,
      "rewards/accuracy_reward/mean": 0.310546875,
      "rewards/accuracy_reward/std": 0.46317005157470703,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 208
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4301.0,
      "completions/max_terminated_length": 4301.0,
      "completions/mean_length": 1033.625,
      "completions/mean_terminated_length": 1033.625,
      "completions/min_length": 385.0,
      "completions/min_terminated_length": 385.0,
      "epoch": 0.4084524245755466,
      "grad_norm": 0.18325104023038696,
      "learning_rate": 1e-06,
      "loss": 0.0145,
      "num_tokens": 88916201.0,
      "reward": 0.734375,
      "reward_std": 0.14592814445495605,
      "rewards/accuracy_reward/mean": 0.236328125,
      "rewards/accuracy_reward/std": 0.42524150013923645,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 209
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3938.0,
      "completions/max_terminated_length": 3938.0,
      "completions/mean_length": 921.39453125,
      "completions/mean_terminated_length": 921.39453125,
      "completions/min_length": 375.0,
      "completions/min_terminated_length": 375.0,
      "epoch": 0.4104067423964822,
      "grad_norm": 0.28524825116918173,
      "learning_rate": 1e-06,
      "loss": 0.0003,
      "num_tokens": 89451411.0,
      "reward": 0.7763671875,
      "reward_std": 0.2564077377319336,
      "rewards/accuracy_reward/mean": 0.27734375,
      "rewards/accuracy_reward/std": 0.4481254518032074,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 210
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4314.0,
      "completions/max_terminated_length": 4314.0,
      "completions/mean_length": 1045.392578125,
      "completions/mean_terminated_length": 1045.392578125,
      "completions/min_length": 356.0,
      "completions/min_terminated_length": 356.0,
      "epoch": 0.41236106021741786,
      "grad_norm": 0.19271881109079272,
      "learning_rate": 1e-06,
      "loss": 0.0211,
      "num_tokens": 90058780.0,
      "reward": 0.70703125,
      "reward_std": 0.1599920094013214,
      "rewards/accuracy_reward/mean": 0.208984375,
      "rewards/accuracy_reward/std": 0.40698084235191345,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 211
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4420.0,
      "completions/max_terminated_length": 4420.0,
      "completions/mean_length": 971.927734375,
      "completions/mean_terminated_length": 971.927734375,
      "completions/min_length": 313.0,
      "completions/min_terminated_length": 313.0,
      "epoch": 0.4143153780383535,
      "grad_norm": 0.2600337313478683,
      "learning_rate": 1e-06,
      "loss": 0.013,
      "num_tokens": 90616535.0,
      "reward": 0.6455078125,
      "reward_std": 0.21474801003932953,
      "rewards/accuracy_reward/mean": 0.146484375,
      "rewards/accuracy_reward/std": 0.35393697023391724,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 212
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4103.0,
      "completions/max_terminated_length": 4103.0,
      "completions/mean_length": 1011.65234375,
      "completions/mean_terminated_length": 1011.65234375,
      "completions/min_length": 453.0,
      "completions/min_terminated_length": 453.0,
      "epoch": 0.41626969585928913,
      "grad_norm": 0.21847161709190585,
      "learning_rate": 1e-06,
      "loss": -0.0045,
      "num_tokens": 91199557.0,
      "reward": 0.716796875,
      "reward_std": 0.2477385401725769,
      "rewards/accuracy_reward/mean": 0.216796875,
      "rewards/accuracy_reward/std": 0.4124660789966583,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 213
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3865.0,
      "completions/max_terminated_length": 3865.0,
      "completions/mean_length": 965.220703125,
      "completions/mean_terminated_length": 965.220703125,
      "completions/min_length": 382.0,
      "completions/min_terminated_length": 382.0,
      "epoch": 0.41822401368022477,
      "grad_norm": 0.22320366682605963,
      "learning_rate": 1e-06,
      "loss": 0.0254,
      "num_tokens": 91765190.0,
      "reward": 0.7041015625,
      "reward_std": 0.2076844871044159,
      "rewards/accuracy_reward/mean": 0.20703125,
      "rewards/accuracy_reward/std": 0.40557438135147095,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.994140625,
      "rewards/soft_format_reward/std": 0.07639661431312561,
      "step": 214
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 4404.0,
      "completions/max_terminated_length": 4404.0,
      "completions/mean_length": 1008.40625,
      "completions/mean_terminated_length": 1010.379638671875,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 322.0,
      "epoch": 0.42017833150116035,
      "grad_norm": 0.28569975603493336,
      "learning_rate": 1e-06,
      "loss": 0.0177,
      "num_tokens": 92359750.0,
      "reward": 0.75390625,
      "reward_std": 0.30729252099990845,
      "rewards/accuracy_reward/mean": 0.259765625,
      "rewards/accuracy_reward/std": 0.4389347732067108,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.98828125,
      "rewards/soft_format_reward/std": 0.10772226005792618,
      "step": 215
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2690.0,
      "completions/max_terminated_length": 2690.0,
      "completions/mean_length": 914.80859375,
      "completions/mean_terminated_length": 914.80859375,
      "completions/min_length": 368.0,
      "completions/min_terminated_length": 368.0,
      "epoch": 0.422132649322096,
      "grad_norm": 0.23027800733494036,
      "learning_rate": 1e-06,
      "loss": 0.0075,
      "num_tokens": 92907172.0,
      "reward": 0.7529296875,
      "reward_std": 0.1622444987297058,
      "rewards/accuracy_reward/mean": 0.25390625,
      "rewards/accuracy_reward/std": 0.43567025661468506,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 216
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4533.0,
      "completions/max_terminated_length": 4533.0,
      "completions/mean_length": 942.66796875,
      "completions/mean_terminated_length": 942.66796875,
      "completions/min_length": 234.0,
      "completions/min_terminated_length": 234.0,
      "epoch": 0.4240869671430316,
      "grad_norm": 0.2574145708092522,
      "learning_rate": 1e-06,
      "loss": 0.0105,
      "num_tokens": 93459210.0,
      "reward": 0.7529296875,
      "reward_std": 0.22094133496284485,
      "rewards/accuracy_reward/mean": 0.25390625,
      "rewards/accuracy_reward/std": 0.43567025661468506,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 217
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2874.0,
      "completions/max_terminated_length": 2874.0,
      "completions/mean_length": 1006.265625,
      "completions/mean_terminated_length": 1006.265625,
      "completions/min_length": 386.0,
      "completions/min_terminated_length": 386.0,
      "epoch": 0.42604128496396726,
      "grad_norm": 0.17045379685947146,
      "learning_rate": 1e-06,
      "loss": -0.0045,
      "num_tokens": 94039442.0,
      "reward": 0.6181640625,
      "reward_std": 0.12891486287117004,
      "rewards/accuracy_reward/mean": 0.119140625,
      "rewards/accuracy_reward/std": 0.32427072525024414,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 218
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2209.0,
      "completions/max_terminated_length": 2209.0,
      "completions/mean_length": 970.173828125,
      "completions/mean_terminated_length": 970.173828125,
      "completions/min_length": 306.0,
      "completions/min_terminated_length": 306.0,
      "epoch": 0.4279956027849029,
      "grad_norm": 0.26850685850054384,
      "learning_rate": 1e-06,
      "loss": 0.0263,
      "num_tokens": 94612219.0,
      "reward": 0.78125,
      "reward_std": 0.3012031018733978,
      "rewards/accuracy_reward/mean": 0.28125,
      "rewards/accuracy_reward/std": 0.45004892349243164,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 219
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3581.0,
      "completions/max_terminated_length": 3581.0,
      "completions/mean_length": 1007.35546875,
      "completions/mean_terminated_length": 1007.35546875,
      "completions/min_length": 331.0,
      "completions/min_terminated_length": 331.0,
      "epoch": 0.42994992060583853,
      "grad_norm": 0.26864480682396996,
      "learning_rate": 1e-06,
      "loss": 0.009,
      "num_tokens": 95203873.0,
      "reward": 0.8642578125,
      "reward_std": 0.3184603750705719,
      "rewards/accuracy_reward/mean": 0.365234375,
      "rewards/accuracy_reward/std": 0.4819667339324951,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 220
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3821.0,
      "completions/max_terminated_length": 3821.0,
      "completions/mean_length": 1079.130859375,
      "completions/mean_terminated_length": 1079.130859375,
      "completions/min_length": 466.0,
      "completions/min_terminated_length": 466.0,
      "epoch": 0.43190423842677417,
      "grad_norm": 0.16386604992074785,
      "learning_rate": 1e-06,
      "loss": 0.0198,
      "num_tokens": 95823764.0,
      "reward": 0.6923828125,
      "reward_std": 0.161734938621521,
      "rewards/accuracy_reward/mean": 0.193359375,
      "rewards/accuracy_reward/std": 0.39531853795051575,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 221
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2110.0,
      "completions/max_terminated_length": 2110.0,
      "completions/mean_length": 989.015625,
      "completions/mean_terminated_length": 989.015625,
      "completions/min_length": 516.0,
      "completions/min_terminated_length": 516.0,
      "epoch": 0.4338585562477098,
      "grad_norm": 0.20953964134020223,
      "learning_rate": 1e-06,
      "loss": -0.0068,
      "num_tokens": 96399212.0,
      "reward": 0.7734375,
      "reward_std": 0.21356725692749023,
      "rewards/accuracy_reward/mean": 0.2734375,
      "rewards/accuracy_reward/std": 0.4461594223976135,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 222
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3732.0,
      "completions/max_terminated_length": 3732.0,
      "completions/mean_length": 1096.54296875,
      "completions/mean_terminated_length": 1096.54296875,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "epoch": 0.43581287406864544,
      "grad_norm": 0.19019372144990224,
      "learning_rate": 1e-06,
      "loss": 0.0071,
      "num_tokens": 97034434.0,
      "reward": 0.6591796875,
      "reward_std": 0.1866757571697235,
      "rewards/accuracy_reward/mean": 0.16015625,
      "rewards/accuracy_reward/std": 0.3671095669269562,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 223
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2360.0,
      "completions/max_terminated_length": 2360.0,
      "completions/mean_length": 1116.783203125,
      "completions/mean_terminated_length": 1116.783203125,
      "completions/min_length": 413.0,
      "completions/min_terminated_length": 413.0,
      "epoch": 0.437767191889581,
      "grad_norm": 0.18315269162731226,
      "learning_rate": 1e-06,
      "loss": 0.0012,
      "num_tokens": 97681667.0,
      "reward": 0.71484375,
      "reward_std": 0.2196236550807953,
      "rewards/accuracy_reward/mean": 0.21484375,
      "rewards/accuracy_reward/std": 0.4111155867576599,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 224
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1895.0,
      "completions/max_terminated_length": 1895.0,
      "completions/mean_length": 849.51171875,
      "completions/mean_terminated_length": 849.51171875,
      "completions/min_length": 393.0,
      "completions/min_terminated_length": 393.0,
      "epoch": 0.43972150971051666,
      "grad_norm": 0.24948571472846473,
      "learning_rate": 1e-06,
      "loss": 0.0054,
      "num_tokens": 98184201.0,
      "reward": 0.7734375,
      "reward_std": 0.24267366528511047,
      "rewards/accuracy_reward/mean": 0.2734375,
      "rewards/accuracy_reward/std": 0.4461594223976135,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 225
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1957.0,
      "completions/max_terminated_length": 1957.0,
      "completions/mean_length": 946.234375,
      "completions/mean_terminated_length": 946.234375,
      "completions/min_length": 290.0,
      "completions/min_terminated_length": 290.0,
      "epoch": 0.4416758275314523,
      "grad_norm": 0.24152675899148837,
      "learning_rate": 1e-06,
      "loss": -0.0047,
      "num_tokens": 98734961.0,
      "reward": 0.716796875,
      "reward_std": 0.26917481422424316,
      "rewards/accuracy_reward/mean": 0.216796875,
      "rewards/accuracy_reward/std": 0.4124660789966583,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 226
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2867.0,
      "completions/max_terminated_length": 2867.0,
      "completions/mean_length": 948.015625,
      "completions/mean_terminated_length": 948.015625,
      "completions/min_length": 370.0,
      "completions/min_terminated_length": 370.0,
      "epoch": 0.44363014535238793,
      "grad_norm": 0.22545582726917313,
      "learning_rate": 1e-06,
      "loss": -0.0017,
      "num_tokens": 99281865.0,
      "reward": 0.830078125,
      "reward_std": 0.3249583840370178,
      "rewards/accuracy_reward/mean": 0.330078125,
      "rewards/accuracy_reward/std": 0.47070086002349854,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 227
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4594.0,
      "completions/max_terminated_length": 4594.0,
      "completions/mean_length": 1066.33203125,
      "completions/mean_terminated_length": 1066.33203125,
      "completions/min_length": 374.0,
      "completions/min_terminated_length": 374.0,
      "epoch": 0.44558446317332356,
      "grad_norm": 0.2096555377981446,
      "learning_rate": 1e-06,
      "loss": 0.0201,
      "num_tokens": 99890819.0,
      "reward": 0.7158203125,
      "reward_std": 0.25165998935699463,
      "rewards/accuracy_reward/mean": 0.2265625,
      "rewards/accuracy_reward/std": 0.4190165400505066,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.978515625,
      "rewards/soft_format_reward/std": 0.14513419568538666,
      "step": 228
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3889.0,
      "completions/max_terminated_length": 3889.0,
      "completions/mean_length": 1005.283203125,
      "completions/mean_terminated_length": 1005.283203125,
      "completions/min_length": 346.0,
      "completions/min_terminated_length": 346.0,
      "epoch": 0.4475387809942592,
      "grad_norm": 0.19251257786284004,
      "learning_rate": 1e-06,
      "loss": 0.0076,
      "num_tokens": 100468452.0,
      "reward": 0.720703125,
      "reward_std": 0.24990171194076538,
      "rewards/accuracy_reward/mean": 0.22265625,
      "rewards/accuracy_reward/std": 0.41643625497817993,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 229
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3115.0,
      "completions/max_terminated_length": 3115.0,
      "completions/mean_length": 939.892578125,
      "completions/mean_terminated_length": 939.892578125,
      "completions/min_length": 363.0,
      "completions/min_terminated_length": 363.0,
      "epoch": 0.44949309881519484,
      "grad_norm": 0.2359322377049206,
      "learning_rate": 1e-06,
      "loss": 0.0194,
      "num_tokens": 101013965.0,
      "reward": 0.7138671875,
      "reward_std": 0.22999022901058197,
      "rewards/accuracy_reward/mean": 0.21484375,
      "rewards/accuracy_reward/std": 0.4111155867576599,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 230
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4053.0,
      "completions/max_terminated_length": 4053.0,
      "completions/mean_length": 984.09375,
      "completions/mean_terminated_length": 984.09375,
      "completions/min_length": 299.0,
      "completions/min_terminated_length": 299.0,
      "epoch": 0.45144741663613047,
      "grad_norm": 0.2865513700141309,
      "learning_rate": 1e-06,
      "loss": -0.0019,
      "num_tokens": 101596893.0,
      "reward": 0.681640625,
      "reward_std": 0.162692129611969,
      "rewards/accuracy_reward/mean": 0.18359375,
      "rewards/accuracy_reward/std": 0.3875311613082886,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 231
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3125.0,
      "completions/max_terminated_length": 3125.0,
      "completions/mean_length": 972.630859375,
      "completions/mean_terminated_length": 972.630859375,
      "completions/min_length": 452.0,
      "completions/min_terminated_length": 452.0,
      "epoch": 0.4534017344570661,
      "grad_norm": 0.1706451353142778,
      "learning_rate": 1e-06,
      "loss": 0.0134,
      "num_tokens": 102160656.0,
      "reward": 0.7021484375,
      "reward_std": 0.159878671169281,
      "rewards/accuracy_reward/mean": 0.203125,
      "rewards/accuracy_reward/std": 0.4027182459831238,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 232
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4431.0,
      "completions/max_terminated_length": 4431.0,
      "completions/mean_length": 1023.845703125,
      "completions/mean_terminated_length": 1023.845703125,
      "completions/min_length": 343.0,
      "completions/min_terminated_length": 343.0,
      "epoch": 0.4553560522780017,
      "grad_norm": 0.14864571037065305,
      "learning_rate": 1e-06,
      "loss": 0.0223,
      "num_tokens": 102750353.0,
      "reward": 0.60546875,
      "reward_std": 0.16349893808364868,
      "rewards/accuracy_reward/mean": 0.107421875,
      "rewards/accuracy_reward/std": 0.30995169281959534,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 233
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3224.0,
      "completions/max_terminated_length": 3224.0,
      "completions/mean_length": 942.154296875,
      "completions/mean_terminated_length": 942.154296875,
      "completions/min_length": 341.0,
      "completions/min_terminated_length": 341.0,
      "epoch": 0.4573103700989373,
      "grad_norm": 0.18473669788538932,
      "learning_rate": 1e-06,
      "loss": -0.0001,
      "num_tokens": 103300384.0,
      "reward": 0.658203125,
      "reward_std": 0.17805281281471252,
      "rewards/accuracy_reward/mean": 0.158203125,
      "rewards/accuracy_reward/std": 0.36528825759887695,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 234
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3319.0,
      "completions/max_terminated_length": 3319.0,
      "completions/mean_length": 978.4921875,
      "completions/mean_terminated_length": 978.4921875,
      "completions/min_length": 352.0,
      "completions/min_terminated_length": 352.0,
      "epoch": 0.45926468791987296,
      "grad_norm": 0.19895438933076293,
      "learning_rate": 1e-06,
      "loss": -0.0011,
      "num_tokens": 103866876.0,
      "reward": 0.6337890625,
      "reward_std": 0.1871449053287506,
      "rewards/accuracy_reward/mean": 0.134765625,
      "rewards/accuracy_reward/std": 0.3418070077896118,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 235
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3574.0,
      "completions/max_terminated_length": 3574.0,
      "completions/mean_length": 1034.4453125,
      "completions/mean_terminated_length": 1034.4453125,
      "completions/min_length": 426.0,
      "completions/min_terminated_length": 426.0,
      "epoch": 0.4612190057408086,
      "grad_norm": 0.19854174163197164,
      "learning_rate": 1e-06,
      "loss": 0.0146,
      "num_tokens": 104482608.0,
      "reward": 0.7099609375,
      "reward_std": 0.22323226928710938,
      "rewards/accuracy_reward/mean": 0.2109375,
      "rewards/accuracy_reward/std": 0.4083731174468994,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 236
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2528.0,
      "completions/max_terminated_length": 2528.0,
      "completions/mean_length": 1098.18359375,
      "completions/mean_terminated_length": 1098.18359375,
      "completions/min_length": 480.0,
      "completions/min_terminated_length": 480.0,
      "epoch": 0.46317332356174423,
      "grad_norm": 0.1821806828854234,
      "learning_rate": 1e-06,
      "loss": 0.0037,
      "num_tokens": 105137966.0,
      "reward": 0.7431640625,
      "reward_std": 0.26687362790107727,
      "rewards/accuracy_reward/mean": 0.244140625,
      "rewards/accuracy_reward/std": 0.42999663949012756,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 237
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3657.0,
      "completions/max_terminated_length": 3657.0,
      "completions/mean_length": 978.833984375,
      "completions/mean_terminated_length": 978.833984375,
      "completions/min_length": 298.0,
      "completions/min_terminated_length": 298.0,
      "epoch": 0.46512764138267987,
      "grad_norm": 0.20262259753735837,
      "learning_rate": 1e-06,
      "loss": 0.0025,
      "num_tokens": 105717225.0,
      "reward": 0.69921875,
      "reward_std": 0.1814504861831665,
      "rewards/accuracy_reward/mean": 0.19921875,
      "rewards/accuracy_reward/std": 0.39980348944664,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 238
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5425.0,
      "completions/max_terminated_length": 5425.0,
      "completions/mean_length": 1133.865234375,
      "completions/mean_terminated_length": 1133.865234375,
      "completions/min_length": 315.0,
      "completions/min_terminated_length": 315.0,
      "epoch": 0.4670819592036155,
      "grad_norm": 0.1699007476515997,
      "learning_rate": 1e-06,
      "loss": 0.0212,
      "num_tokens": 106362132.0,
      "reward": 0.701171875,
      "reward_std": 0.21961811184883118,
      "rewards/accuracy_reward/mean": 0.205078125,
      "rewards/accuracy_reward/std": 0.4041535556316376,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.9921875,
      "rewards/soft_format_reward/std": 0.08812850713729858,
      "step": 239
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 4108.0,
      "completions/max_terminated_length": 4108.0,
      "completions/mean_length": 951.15625,
      "completions/mean_terminated_length": 953.017578125,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 344.0,
      "epoch": 0.46903627702455114,
      "grad_norm": 0.29621199324090425,
      "learning_rate": 1e-06,
      "loss": 0.0166,
      "num_tokens": 106923012.0,
      "reward": 0.79296875,
      "reward_std": 0.31659504771232605,
      "rewards/accuracy_reward/mean": 0.302734375,
      "rewards/accuracy_reward/std": 0.45989060401916504,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.98046875,
      "rewards/soft_format_reward/std": 0.1385180652141571,
      "step": 240
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5658.0,
      "completions/max_terminated_length": 5658.0,
      "completions/mean_length": 1088.650390625,
      "completions/mean_terminated_length": 1088.650390625,
      "completions/min_length": 406.0,
      "completions/min_terminated_length": 406.0,
      "epoch": 0.4709905948454867,
      "grad_norm": 0.25561270272714015,
      "learning_rate": 1e-06,
      "loss": 0.0275,
      "num_tokens": 107561265.0,
      "reward": 0.7373046875,
      "reward_std": 0.2741406559944153,
      "rewards/accuracy_reward/mean": 0.248046875,
      "rewards/accuracy_reward/std": 0.4323015511035919,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.978515625,
      "rewards/soft_format_reward/std": 0.14513419568538666,
      "step": 241
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00390625,
      "completions/max_length": 3788.0,
      "completions/max_terminated_length": 3788.0,
      "completions/mean_length": 1010.59375,
      "completions/mean_terminated_length": 1014.5569458007812,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 297.0,
      "epoch": 0.47294491266642236,
      "grad_norm": 0.2485195300084967,
      "learning_rate": 1e-06,
      "loss": 0.0101,
      "num_tokens": 108146161.0,
      "reward": 0.7236328125,
      "reward_std": 0.23151344060897827,
      "rewards/accuracy_reward/mean": 0.2265625,
      "rewards/accuracy_reward/std": 0.4190165400505066,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.994140625,
      "rewards/soft_format_reward/std": 0.07639661431312561,
      "step": 242
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00390625,
      "completions/max_length": 4851.0,
      "completions/max_terminated_length": 4851.0,
      "completions/mean_length": 1111.775390625,
      "completions/mean_terminated_length": 1116.1353759765625,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 379.0,
      "epoch": 0.474899230487358,
      "grad_norm": 0.2021770539335826,
      "learning_rate": 1e-06,
      "loss": -0.014,
      "num_tokens": 108785758.0,
      "reward": 0.7080078125,
      "reward_std": 0.26883962750434875,
      "rewards/accuracy_reward/mean": 0.212890625,
      "rewards/accuracy_reward/std": 0.409751296043396,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.990234375,
      "rewards/soft_format_reward/std": 0.09843364357948303,
      "step": 243
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 7216.0,
      "completions/max_terminated_length": 7216.0,
      "completions/mean_length": 1114.83984375,
      "completions/mean_terminated_length": 1117.021484375,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 572.0,
      "epoch": 0.47685354830829363,
      "grad_norm": 0.25711707470165174,
      "learning_rate": 1e-06,
      "loss": 0.0115,
      "num_tokens": 109440076.0,
      "reward": 0.71484375,
      "reward_std": 0.2208394706249237,
      "rewards/accuracy_reward/mean": 0.216796875,
      "rewards/accuracy_reward/std": 0.4124660789966583,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 244
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.04296875,
      "completions/max_length": 7844.0,
      "completions/max_terminated_length": 7844.0,
      "completions/mean_length": 1116.5625,
      "completions/mean_terminated_length": 1166.69384765625,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 314.0,
      "epoch": 0.47880786612922926,
      "grad_norm": 0.31192577891301276,
      "learning_rate": 1e-06,
      "loss": 0.0091,
      "num_tokens": 110091964.0,
      "reward": 0.6650390625,
      "reward_std": 0.2845654785633087,
      "rewards/accuracy_reward/mean": 0.173828125,
      "rewards/accuracy_reward/std": 0.3793322443962097,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.982421875,
      "rewards/soft_format_reward/std": 0.13154059648513794,
      "step": 245
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1015625,
      "completions/max_length": 7687.0,
      "completions/max_terminated_length": 7687.0,
      "completions/mean_length": 1083.162109375,
      "completions/mean_terminated_length": 1205.6064453125,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 451.0,
      "epoch": 0.4807621839501649,
      "grad_norm": 0.3150070132679766,
      "learning_rate": 1e-06,
      "loss": 0.0324,
      "num_tokens": 110719583.0,
      "reward": 0.724609375,
      "reward_std": 0.2723310887813568,
      "rewards/accuracy_reward/mean": 0.234375,
      "rewards/accuracy_reward/std": 0.42402184009552,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.98046875,
      "rewards/soft_format_reward/std": 0.1385180652141571,
      "step": 246
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 8160.0,
      "completions/max_terminated_length": 8160.0,
      "completions/mean_length": 1227.6328125,
      "completions/mean_terminated_length": 1454.9722900390625,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 498.0,
      "epoch": 0.48271650177110054,
      "grad_norm": 0.24007897809165102,
      "learning_rate": 1e-06,
      "loss": 0.08,
      "num_tokens": 111427715.0,
      "reward": 0.7412109375,
      "reward_std": 0.3292117714881897,
      "rewards/accuracy_reward/mean": 0.2578125,
      "rewards/accuracy_reward/std": 0.43785804510116577,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.966796875,
      "rewards/soft_format_reward/std": 0.17934183776378632,
      "step": 247
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.228515625,
      "completions/max_length": 7801.0,
      "completions/max_terminated_length": 7801.0,
      "completions/mean_length": 1047.724609375,
      "completions/mean_terminated_length": 1358.0633544921875,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 494.0,
      "epoch": 0.4846708195920362,
      "grad_norm": 0.3638344282061002,
      "learning_rate": 1e-06,
      "loss": 0.0748,
      "num_tokens": 112034038.0,
      "reward": 0.69921875,
      "reward_std": 0.3243100941181183,
      "rewards/accuracy_reward/mean": 0.224609375,
      "rewards/accuracy_reward/std": 0.41773295402526855,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.94921875,
      "rewards/soft_format_reward/std": 0.21976542472839355,
      "step": 248
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.173828125,
      "completions/max_length": 7779.0,
      "completions/max_terminated_length": 7779.0,
      "completions/mean_length": 1090.521484375,
      "completions/mean_terminated_length": 1319.96923828125,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 343.0,
      "epoch": 0.4866251374129718,
      "grad_norm": 0.31731663593912496,
      "learning_rate": 1e-06,
      "loss": 0.0131,
      "num_tokens": 112654273.0,
      "reward": 0.818359375,
      "reward_std": 0.38540610671043396,
      "rewards/accuracy_reward/mean": 0.333984375,
      "rewards/accuracy_reward/std": 0.47209542989730835,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.96875,
      "rewards/soft_format_reward/std": 0.17416280508041382,
      "step": 249
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.140625,
      "completions/max_length": 7232.0,
      "completions/max_terminated_length": 7232.0,
      "completions/mean_length": 1152.212890625,
      "completions/mean_terminated_length": 1340.7568359375,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 431.0,
      "epoch": 0.4885794552339074,
      "grad_norm": 0.39777837832174695,
      "learning_rate": 1e-06,
      "loss": 0.0329,
      "num_tokens": 113309134.0,
      "reward": 0.66796875,
      "reward_std": 0.26079481840133667,
      "rewards/accuracy_reward/mean": 0.1875,
      "rewards/accuracy_reward/std": 0.39069411158561707,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.9609375,
      "rewards/soft_format_reward/std": 0.1939331740140915,
      "step": 250
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.142578125,
      "completions/max_length": 8176.0,
      "completions/max_terminated_length": 8176.0,
      "completions/mean_length": 1032.98046875,
      "completions/mean_terminated_length": 1204.751708984375,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 427.0,
      "epoch": 0.490533773054843,
      "grad_norm": 0.26392963715768764,
      "learning_rate": 1e-06,
      "loss": 0.0337,
      "num_tokens": 113908852.0,
      "reward": 0.681640625,
      "reward_std": 0.2452651858329773,
      "rewards/accuracy_reward/mean": 0.193359375,
      "rewards/accuracy_reward/std": 0.39531853795051575,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.9765625,
      "rewards/soft_format_reward/std": 0.15143637359142303,
      "step": 251
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.076171875,
      "completions/max_length": 7773.0,
      "completions/max_terminated_length": 7773.0,
      "completions/mean_length": 1102.197265625,
      "completions/mean_terminated_length": 1193.0760498046875,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 381.0,
      "epoch": 0.49248809087577866,
      "grad_norm": 0.27529383925423156,
      "learning_rate": 1e-06,
      "loss": 0.0246,
      "num_tokens": 114539913.0,
      "reward": 0.751953125,
      "reward_std": 0.31733620166778564,
      "rewards/accuracy_reward/mean": 0.263671875,
      "rewards/accuracy_reward/std": 0.4410543739795685,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.9765625,
      "rewards/soft_format_reward/std": 0.15143637359142303,
      "step": 252
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.048828125,
      "completions/max_length": 7395.0,
      "completions/max_terminated_length": 7395.0,
      "completions/mean_length": 1006.037109375,
      "completions/mean_terminated_length": 1057.6817626953125,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 506.0,
      "epoch": 0.4944424086967143,
      "grad_norm": 0.2879368899238134,
      "learning_rate": 1e-06,
      "loss": 0.0509,
      "num_tokens": 115128316.0,
      "reward": 0.7939453125,
      "reward_std": 0.2664770185947418,
      "rewards/accuracy_reward/mean": 0.30078125,
      "rewards/accuracy_reward/std": 0.45904624462127686,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.986328125,
      "rewards/soft_format_reward/std": 0.1162383034825325,
      "step": 253
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.05078125,
      "completions/max_length": 5063.0,
      "completions/max_terminated_length": 5063.0,
      "completions/mean_length": 962.44140625,
      "completions/mean_terminated_length": 1013.9299926757812,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 371.0,
      "epoch": 0.49639672651764993,
      "grad_norm": 0.24299119734822017,
      "learning_rate": 1e-06,
      "loss": 0.0092,
      "num_tokens": 115689150.0,
      "reward": 0.6796875,
      "reward_std": 0.18629369139671326,
      "rewards/accuracy_reward/mean": 0.1875,
      "rewards/accuracy_reward/std": 0.39069411158561707,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.984375,
      "rewards/soft_format_reward/std": 0.12414088100194931,
      "step": 254
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013671875,
      "completions/max_length": 3467.0,
      "completions/max_terminated_length": 3467.0,
      "completions/mean_length": 1049.736328125,
      "completions/mean_terminated_length": 1064.287109375,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 460.0,
      "epoch": 0.49835104433858557,
      "grad_norm": 0.3477404893024017,
      "learning_rate": 1e-06,
      "loss": 0.0185,
      "num_tokens": 116292535.0,
      "reward": 0.6904296875,
      "reward_std": 0.2398054003715515,
      "rewards/accuracy_reward/mean": 0.203125,
      "rewards/accuracy_reward/std": 0.4027182459831238,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.974609375,
      "rewards/soft_format_reward/std": 0.15746226906776428,
      "step": 255
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 3978.0,
      "completions/max_terminated_length": 3978.0,
      "completions/mean_length": 1005.923828125,
      "completions/mean_terminated_length": 1007.892333984375,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 411.0,
      "epoch": 0.5003053621595211,
      "grad_norm": 0.2156115651026281,
      "learning_rate": 1e-06,
      "loss": 0.0148,
      "num_tokens": 116868304.0,
      "reward": 0.66796875,
      "reward_std": 0.23239204287528992,
      "rewards/accuracy_reward/mean": 0.171875,
      "rewards/accuracy_reward/std": 0.3776407241821289,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.9921875,
      "rewards/soft_format_reward/std": 0.08812850713729858,
      "step": 256
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 4048.0,
      "completions/max_terminated_length": 4048.0,
      "completions/mean_length": 1019.859375,
      "completions/mean_terminated_length": 1021.8551635742188,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 414.0,
      "epoch": 0.5022596799804568,
      "grad_norm": 0.9399209758441138,
      "learning_rate": 1e-06,
      "loss": 0.0001,
      "num_tokens": 117449320.0,
      "reward": 0.6611328125,
      "reward_std": 0.22365951538085938,
      "rewards/accuracy_reward/mean": 0.166015625,
      "rewards/accuracy_reward/std": 0.3724585771560669,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.990234375,
      "rewards/soft_format_reward/std": 0.09843364357948303,
      "step": 257
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 7381.0,
      "completions/max_terminated_length": 7381.0,
      "completions/mean_length": 969.4609375,
      "completions/mean_terminated_length": 969.4609375,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 384.0,
      "epoch": 0.5042139978013924,
      "grad_norm": 0.2244360966335621,
      "learning_rate": 1e-06,
      "loss": 0.0023,
      "num_tokens": 118001092.0,
      "reward": 0.76171875,
      "reward_std": 0.22594638168811798,
      "rewards/accuracy_reward/mean": 0.263671875,
      "rewards/accuracy_reward/std": 0.4410543739795685,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 258
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3167.0,
      "completions/max_terminated_length": 3167.0,
      "completions/mean_length": 986.40234375,
      "completions/mean_terminated_length": 986.40234375,
      "completions/min_length": 362.0,
      "completions/min_terminated_length": 362.0,
      "epoch": 0.5061683156223281,
      "grad_norm": 0.29015636180825405,
      "learning_rate": 1e-06,
      "loss": 0.0002,
      "num_tokens": 118571442.0,
      "reward": 0.72265625,
      "reward_std": 0.23174801468849182,
      "rewards/accuracy_reward/mean": 0.22265625,
      "rewards/accuracy_reward/std": 0.41643625497817993,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 259
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2125.0,
      "completions/max_terminated_length": 2125.0,
      "completions/mean_length": 1028.5,
      "completions/mean_terminated_length": 1028.5,
      "completions/min_length": 482.0,
      "completions/min_terminated_length": 482.0,
      "epoch": 0.5081226334432637,
      "grad_norm": 0.2176405700101328,
      "learning_rate": 1e-06,
      "loss": -0.0033,
      "num_tokens": 119180418.0,
      "reward": 0.720703125,
      "reward_std": 0.2283822000026703,
      "rewards/accuracy_reward/mean": 0.220703125,
      "rewards/accuracy_reward/std": 0.4151262938976288,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 260
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4423.0,
      "completions/max_terminated_length": 4423.0,
      "completions/mean_length": 997.69921875,
      "completions/mean_terminated_length": 997.69921875,
      "completions/min_length": 341.0,
      "completions/min_terminated_length": 341.0,
      "epoch": 0.5100769512641994,
      "grad_norm": 0.22842908849300012,
      "learning_rate": 1e-06,
      "loss": 0.0042,
      "num_tokens": 119768904.0,
      "reward": 0.83984375,
      "reward_std": 0.25301796197891235,
      "rewards/accuracy_reward/mean": 0.33984375,
      "rewards/accuracy_reward/std": 0.4741191864013672,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 261
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4113.0,
      "completions/max_terminated_length": 4113.0,
      "completions/mean_length": 998.474609375,
      "completions/mean_terminated_length": 998.474609375,
      "completions/min_length": 357.0,
      "completions/min_terminated_length": 357.0,
      "epoch": 0.512031269085135,
      "grad_norm": 0.1840915908201,
      "learning_rate": 1e-06,
      "loss": -0.0025,
      "num_tokens": 120352107.0,
      "reward": 0.7314453125,
      "reward_std": 0.1765916347503662,
      "rewards/accuracy_reward/mean": 0.232421875,
      "rewards/accuracy_reward/std": 0.42278963327407837,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 262
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3735.0,
      "completions/max_terminated_length": 3735.0,
      "completions/mean_length": 985.5859375,
      "completions/mean_terminated_length": 985.5859375,
      "completions/min_length": 387.0,
      "completions/min_terminated_length": 387.0,
      "epoch": 0.5139855869060705,
      "grad_norm": 0.22731725715057885,
      "learning_rate": 1e-06,
      "loss": 0.0135,
      "num_tokens": 120921511.0,
      "reward": 0.734375,
      "reward_std": 0.27029913663864136,
      "rewards/accuracy_reward/mean": 0.234375,
      "rewards/accuracy_reward/std": 0.42402184009552,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 263
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3701.0,
      "completions/max_terminated_length": 3701.0,
      "completions/mean_length": 1013.005859375,
      "completions/mean_terminated_length": 1013.005859375,
      "completions/min_length": 303.0,
      "completions/min_terminated_length": 303.0,
      "epoch": 0.5159399047270062,
      "grad_norm": 0.3162499362581272,
      "learning_rate": 1e-06,
      "loss": 0.0079,
      "num_tokens": 121500890.0,
      "reward": 0.7724609375,
      "reward_std": 0.2833487391471863,
      "rewards/accuracy_reward/mean": 0.2734375,
      "rewards/accuracy_reward/std": 0.4461594223976135,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 264
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3627.0,
      "completions/max_terminated_length": 3627.0,
      "completions/mean_length": 920.548828125,
      "completions/mean_terminated_length": 920.548828125,
      "completions/min_length": 305.0,
      "completions/min_terminated_length": 305.0,
      "epoch": 0.5178942225479418,
      "grad_norm": 0.23792807621234424,
      "learning_rate": 1e-06,
      "loss": 0.0103,
      "num_tokens": 122041043.0,
      "reward": 0.916015625,
      "reward_std": 0.28482675552368164,
      "rewards/accuracy_reward/mean": 0.416015625,
      "rewards/accuracy_reward/std": 0.493378221988678,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 265
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4471.0,
      "completions/max_terminated_length": 4471.0,
      "completions/mean_length": 1087.7578125,
      "completions/mean_terminated_length": 1087.7578125,
      "completions/min_length": 380.0,
      "completions/min_terminated_length": 380.0,
      "epoch": 0.5198485403688775,
      "grad_norm": 0.2552252902728524,
      "learning_rate": 1e-06,
      "loss": -0.0169,
      "num_tokens": 122679143.0,
      "reward": 0.8125,
      "reward_std": 0.21330136060714722,
      "rewards/accuracy_reward/mean": 0.3125,
      "rewards/accuracy_reward/std": 0.4639657139778137,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 266
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 4663.0,
      "completions/max_terminated_length": 4663.0,
      "completions/mean_length": 994.794921875,
      "completions/mean_terminated_length": 996.74169921875,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 445.0,
      "epoch": 0.5218028581898131,
      "grad_norm": 0.29560652769427187,
      "learning_rate": 1e-06,
      "loss": -0.0021,
      "num_tokens": 123257486.0,
      "reward": 0.6318359375,
      "reward_std": 0.21056944131851196,
      "rewards/accuracy_reward/mean": 0.13671875,
      "rewards/accuracy_reward/std": 0.3438861668109894,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.990234375,
      "rewards/soft_format_reward/std": 0.09843364357948303,
      "step": 267
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0078125,
      "completions/max_length": 4422.0,
      "completions/max_terminated_length": 4422.0,
      "completions/mean_length": 1029.32421875,
      "completions/mean_terminated_length": 1037.4290771484375,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 488.0,
      "epoch": 0.5237571760107488,
      "grad_norm": 0.20298588197620862,
      "learning_rate": 1e-06,
      "loss": 0.0043,
      "num_tokens": 123850324.0,
      "reward": 0.6572265625,
      "reward_std": 0.16812849044799805,
      "rewards/accuracy_reward/mean": 0.158203125,
      "rewards/accuracy_reward/std": 0.36528825759887695,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 268
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017578125,
      "completions/max_length": 7146.0,
      "completions/max_terminated_length": 7146.0,
      "completions/mean_length": 1039.353515625,
      "completions/mean_terminated_length": 1057.9501953125,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 407.0,
      "epoch": 0.5257114938316844,
      "grad_norm": 0.2642426401711122,
      "learning_rate": 1e-06,
      "loss": 0.0004,
      "num_tokens": 124466681.0,
      "reward": 0.7431640625,
      "reward_std": 0.3039402961730957,
      "rewards/accuracy_reward/mean": 0.244140625,
      "rewards/accuracy_reward/std": 0.42999663949012756,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 269
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 8017.0,
      "completions/max_terminated_length": 8017.0,
      "completions/mean_length": 989.63671875,
      "completions/mean_terminated_length": 1005.3452758789062,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 365.0,
      "epoch": 0.52766581165262,
      "grad_norm": 0.321186877566316,
      "learning_rate": 1e-06,
      "loss": 0.0055,
      "num_tokens": 125047343.0,
      "reward": 0.7265625,
      "reward_std": 0.27988117933273315,
      "rewards/accuracy_reward/mean": 0.2265625,
      "rewards/accuracy_reward/std": 0.4190165400505066,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 270
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.009765625,
      "completions/max_length": 8071.0,
      "completions/max_terminated_length": 8071.0,
      "completions/mean_length": 980.1796875,
      "completions/mean_terminated_length": 989.84619140625,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 462.0,
      "epoch": 0.5296201294735556,
      "grad_norm": 0.25814862577428566,
      "learning_rate": 1e-06,
      "loss": 0.0095,
      "num_tokens": 125623019.0,
      "reward": 0.7734375,
      "reward_std": 0.2514013648033142,
      "rewards/accuracy_reward/mean": 0.2734375,
      "rewards/accuracy_reward/std": 0.4461594223976135,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 271
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 3870.0,
      "completions/max_terminated_length": 3870.0,
      "completions/mean_length": 899.365234375,
      "completions/mean_terminated_length": 901.125244140625,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 370.0,
      "epoch": 0.5315744472944912,
      "grad_norm": 0.30822629477420593,
      "learning_rate": 1e-06,
      "loss": -0.0009,
      "num_tokens": 126154806.0,
      "reward": 0.833984375,
      "reward_std": 0.24169795215129852,
      "rewards/accuracy_reward/mean": 0.333984375,
      "rewards/accuracy_reward/std": 0.47209542989730835,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 272
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 3584.0,
      "completions/max_terminated_length": 3584.0,
      "completions/mean_length": 885.96484375,
      "completions/mean_terminated_length": 887.6986083984375,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 354.0,
      "epoch": 0.5335287651154269,
      "grad_norm": 0.4844193958853699,
      "learning_rate": 1e-06,
      "loss": -0.0007,
      "num_tokens": 126681124.0,
      "reward": 0.8046875,
      "reward_std": 0.2762932777404785,
      "rewards/accuracy_reward/mean": 0.3046875,
      "rewards/accuracy_reward/std": 0.4607250988483429,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 273
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013671875,
      "completions/max_length": 2763.0,
      "completions/max_terminated_length": 2763.0,
      "completions/mean_length": 970.884765625,
      "completions/mean_terminated_length": 984.3425903320312,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 436.0,
      "epoch": 0.5354830829363625,
      "grad_norm": 0.4122744154532177,
      "learning_rate": 1e-06,
      "loss": 0.0022,
      "num_tokens": 127245113.0,
      "reward": 0.8466796875,
      "reward_std": 0.280442476272583,
      "rewards/accuracy_reward/mean": 0.34765625,
      "rewards/accuracy_reward/std": 0.47669193148612976,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 274
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00390625,
      "completions/max_length": 3716.0,
      "completions/max_terminated_length": 3716.0,
      "completions/mean_length": 973.455078125,
      "completions/mean_terminated_length": 977.2725830078125,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 471.0,
      "epoch": 0.5374374007572982,
      "grad_norm": 0.4641599866232637,
      "learning_rate": 1e-06,
      "loss": 0.0078,
      "num_tokens": 127809442.0,
      "reward": 0.7626953125,
      "reward_std": 0.30328112840652466,
      "rewards/accuracy_reward/mean": 0.263671875,
      "rewards/accuracy_reward/std": 0.4410543739795685,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 275
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.005859375,
      "completions/max_length": 4072.0,
      "completions/max_terminated_length": 4072.0,
      "completions/mean_length": 953.28515625,
      "completions/mean_terminated_length": 958.9037475585938,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 416.0,
      "epoch": 0.5393917185782338,
      "grad_norm": 0.4011218132397713,
      "learning_rate": 1e-06,
      "loss": 0.0003,
      "num_tokens": 128359876.0,
      "reward": 0.837890625,
      "reward_std": 0.269453763961792,
      "rewards/accuracy_reward/mean": 0.337890625,
      "rewards/accuracy_reward/std": 0.4734536409378052,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 276
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1784.0,
      "completions/max_terminated_length": 1784.0,
      "completions/mean_length": 882.904296875,
      "completions/mean_terminated_length": 882.904296875,
      "completions/min_length": 351.0,
      "completions/min_terminated_length": 351.0,
      "epoch": 0.5413460363991695,
      "grad_norm": 0.5510919794170652,
      "learning_rate": 1e-06,
      "loss": 0.0025,
      "num_tokens": 128872547.0,
      "reward": 0.69921875,
      "reward_std": 0.24932363629341125,
      "rewards/accuracy_reward/mean": 0.19921875,
      "rewards/accuracy_reward/std": 0.39980348944664,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 277
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2081.0,
      "completions/max_terminated_length": 2081.0,
      "completions/mean_length": 983.107421875,
      "completions/mean_terminated_length": 983.107421875,
      "completions/min_length": 437.0,
      "completions/min_terminated_length": 437.0,
      "epoch": 0.543300354220105,
      "grad_norm": 0.694320230931412,
      "learning_rate": 1e-06,
      "loss": 0.0058,
      "num_tokens": 129446282.0,
      "reward": 0.74609375,
      "reward_std": 0.2507278323173523,
      "rewards/accuracy_reward/mean": 0.24609375,
      "rewards/accuracy_reward/std": 0.4311550557613373,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 278
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 1908.0,
      "completions/max_terminated_length": 1908.0,
      "completions/mean_length": 924.23828125,
      "completions/mean_terminated_length": 926.0469360351562,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 272.0,
      "epoch": 0.5452546720410407,
      "grad_norm": 0.28831960857551026,
      "learning_rate": 1e-06,
      "loss": -0.0036,
      "num_tokens": 129985492.0,
      "reward": 0.7939453125,
      "reward_std": 0.24243290722370148,
      "rewards/accuracy_reward/mean": 0.294921875,
      "rewards/accuracy_reward/std": 0.4564536213874817,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 279
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 2227.0,
      "completions/max_terminated_length": 2227.0,
      "completions/mean_length": 973.123046875,
      "completions/mean_terminated_length": 975.0274047851562,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 365.0,
      "epoch": 0.5472089898619763,
      "grad_norm": 0.45862523305426195,
      "learning_rate": 1e-06,
      "loss": -0.0055,
      "num_tokens": 130545859.0,
      "reward": 0.80078125,
      "reward_std": 0.22870126366615295,
      "rewards/accuracy_reward/mean": 0.30078125,
      "rewards/accuracy_reward/std": 0.45904624462127686,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 280
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00390625,
      "completions/max_length": 3546.0,
      "completions/max_terminated_length": 3546.0,
      "completions/mean_length": 886.798828125,
      "completions/mean_terminated_length": 890.2765502929688,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 333.0,
      "epoch": 0.5491633076829119,
      "grad_norm": 0.27981683414820635,
      "learning_rate": 1e-06,
      "loss": -0.004,
      "num_tokens": 131065628.0,
      "reward": 0.716796875,
      "reward_std": 0.24010811746120453,
      "rewards/accuracy_reward/mean": 0.216796875,
      "rewards/accuracy_reward/std": 0.4124660789966583,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 281
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.009765625,
      "completions/max_length": 3222.0,
      "completions/max_terminated_length": 3222.0,
      "completions/mean_length": 923.705078125,
      "completions/mean_terminated_length": 932.8146362304688,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 298.0,
      "epoch": 0.5511176255038476,
      "grad_norm": 0.9012737221127962,
      "learning_rate": 1e-06,
      "loss": -0.0009,
      "num_tokens": 131605637.0,
      "reward": 0.748046875,
      "reward_std": 0.19029605388641357,
      "rewards/accuracy_reward/mean": 0.248046875,
      "rewards/accuracy_reward/std": 0.4323015511035919,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 282
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 4594.0,
      "completions/max_terminated_length": 4594.0,
      "completions/mean_length": 948.59765625,
      "completions/mean_terminated_length": 950.4539794921875,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 434.0,
      "epoch": 0.5530719433247832,
      "grad_norm": 0.5741973088658604,
      "learning_rate": 1e-06,
      "loss": 0.0092,
      "num_tokens": 132156151.0,
      "reward": 0.716796875,
      "reward_std": 0.2526024580001831,
      "rewards/accuracy_reward/mean": 0.216796875,
      "rewards/accuracy_reward/std": 0.4124660789966583,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 283
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1991.0,
      "completions/max_terminated_length": 1991.0,
      "completions/mean_length": 893.158203125,
      "completions/mean_terminated_length": 893.158203125,
      "completions/min_length": 369.0,
      "completions/min_terminated_length": 369.0,
      "epoch": 0.5550262611457188,
      "grad_norm": 0.3979017319882626,
      "learning_rate": 1e-06,
      "loss": -0.0045,
      "num_tokens": 132679368.0,
      "reward": 0.673828125,
      "reward_std": 0.20905625820159912,
      "rewards/accuracy_reward/mean": 0.173828125,
      "rewards/accuracy_reward/std": 0.3793322443962097,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 284
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 1730.0,
      "completions/max_terminated_length": 1730.0,
      "completions/mean_length": 949.421875,
      "completions/mean_terminated_length": 951.2798461914062,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 310.0,
      "epoch": 0.5569805789666544,
      "grad_norm": 0.6711761247167988,
      "learning_rate": 1e-06,
      "loss": 0.0049,
      "num_tokens": 133230224.0,
      "reward": 0.7265625,
      "reward_std": 0.267217218875885,
      "rewards/accuracy_reward/mean": 0.2265625,
      "rewards/accuracy_reward/std": 0.4190165400505066,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 285
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4416.0,
      "completions/max_terminated_length": 4416.0,
      "completions/mean_length": 976.94921875,
      "completions/mean_terminated_length": 976.94921875,
      "completions/min_length": 393.0,
      "completions/min_terminated_length": 393.0,
      "epoch": 0.5589348967875901,
      "grad_norm": 0.2792434772760303,
      "learning_rate": 1e-06,
      "loss": 0.0073,
      "num_tokens": 133793878.0,
      "reward": 0.802734375,
      "reward_std": 0.2886734902858734,
      "rewards/accuracy_reward/mean": 0.302734375,
      "rewards/accuracy_reward/std": 0.45989060401916504,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 286
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.005859375,
      "completions/max_length": 7203.0,
      "completions/max_terminated_length": 7203.0,
      "completions/mean_length": 946.064453125,
      "completions/mean_terminated_length": 951.6405029296875,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 390.0,
      "epoch": 0.5608892146085257,
      "grad_norm": 1.9373572092283484,
      "learning_rate": 1e-06,
      "loss": -0.0066,
      "num_tokens": 134348103.0,
      "reward": 0.7109375,
      "reward_std": 0.2510862350463867,
      "rewards/accuracy_reward/mean": 0.2109375,
      "rewards/accuracy_reward/std": 0.4083731174468994,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 287
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00390625,
      "completions/max_length": 2097.0,
      "completions/max_terminated_length": 2097.0,
      "completions/mean_length": 954.208984375,
      "completions/mean_terminated_length": 957.9510498046875,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 377.0,
      "epoch": 0.5628435324294614,
      "grad_norm": 24.67543527023826,
      "learning_rate": 1e-06,
      "loss": -0.0175,
      "num_tokens": 134902098.0,
      "reward": 0.68359375,
      "reward_std": 0.1904578059911728,
      "rewards/accuracy_reward/mean": 0.18359375,
      "rewards/accuracy_reward/std": 0.3875311613082886,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 288
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00390625,
      "completions/max_length": 7087.0,
      "completions/max_terminated_length": 7087.0,
      "completions/mean_length": 915.7578125,
      "completions/mean_terminated_length": 919.3490600585938,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 406.0,
      "epoch": 0.564797850250397,
      "grad_norm": 4.570825105490816,
      "learning_rate": 1e-06,
      "loss": 0.0006,
      "num_tokens": 135432070.0,
      "reward": 0.712890625,
      "reward_std": 0.22883236408233643,
      "rewards/accuracy_reward/mean": 0.212890625,
      "rewards/accuracy_reward/std": 0.409751296043396,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 289
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1828.0,
      "completions/max_terminated_length": 1828.0,
      "completions/mean_length": 918.921875,
      "completions/mean_terminated_length": 918.921875,
      "completions/min_length": 474.0,
      "completions/min_terminated_length": 474.0,
      "epoch": 0.5667521680713326,
      "grad_norm": 0.6554731871867421,
      "learning_rate": 1e-06,
      "loss": 0.0029,
      "num_tokens": 135981006.0,
      "reward": 0.822265625,
      "reward_std": 0.24702796339988708,
      "rewards/accuracy_reward/mean": 0.322265625,
      "rewards/accuracy_reward/std": 0.46780112385749817,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 290
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 4170.0,
      "completions/max_terminated_length": 4170.0,
      "completions/mean_length": 1039.828125,
      "completions/mean_terminated_length": 1041.863037109375,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 478.0,
      "epoch": 0.5687064858922682,
      "grad_norm": 0.36442645531799234,
      "learning_rate": 1e-06,
      "loss": -0.0086,
      "num_tokens": 136583926.0,
      "reward": 0.7509765625,
      "reward_std": 0.2140166163444519,
      "rewards/accuracy_reward/mean": 0.251953125,
      "rewards/accuracy_reward/std": 0.43455907702445984,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 291
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4348.0,
      "completions/max_terminated_length": 4348.0,
      "completions/mean_length": 1105.818359375,
      "completions/mean_terminated_length": 1105.818359375,
      "completions/min_length": 479.0,
      "completions/min_terminated_length": 479.0,
      "epoch": 0.5706608037132038,
      "grad_norm": 0.5143258161999796,
      "learning_rate": 1e-06,
      "loss": 0.0115,
      "num_tokens": 137219849.0,
      "reward": 0.6630859375,
      "reward_std": 0.2245439738035202,
      "rewards/accuracy_reward/mean": 0.1640625,
      "rewards/accuracy_reward/std": 0.37069445848464966,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 292
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2486.0,
      "completions/max_terminated_length": 2486.0,
      "completions/mean_length": 930.1328125,
      "completions/mean_terminated_length": 930.1328125,
      "completions/min_length": 425.0,
      "completions/min_terminated_length": 425.0,
      "epoch": 0.5726151215341395,
      "grad_norm": 0.28065735282251886,
      "learning_rate": 1e-06,
      "loss": 0.0027,
      "num_tokens": 137788365.0,
      "reward": 0.916015625,
      "reward_std": 0.3028775751590729,
      "rewards/accuracy_reward/mean": 0.416015625,
      "rewards/accuracy_reward/std": 0.493378221988678,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 293
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3869.0,
      "completions/max_terminated_length": 3869.0,
      "completions/mean_length": 1040.60546875,
      "completions/mean_terminated_length": 1040.60546875,
      "completions/min_length": 373.0,
      "completions/min_terminated_length": 373.0,
      "epoch": 0.5745694393550751,
      "grad_norm": 0.2291918759972196,
      "learning_rate": 1e-06,
      "loss": 0.0189,
      "num_tokens": 138395043.0,
      "reward": 0.8076171875,
      "reward_std": 0.2563822567462921,
      "rewards/accuracy_reward/mean": 0.30859375,
      "rewards/accuracy_reward/std": 0.4623647928237915,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 294
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4291.0,
      "completions/max_terminated_length": 4291.0,
      "completions/mean_length": 996.271484375,
      "completions/mean_terminated_length": 996.271484375,
      "completions/min_length": 369.0,
      "completions/min_terminated_length": 369.0,
      "epoch": 0.5765237571760108,
      "grad_norm": 0.2593726473999473,
      "learning_rate": 1e-06,
      "loss": 0.0148,
      "num_tokens": 138982126.0,
      "reward": 0.873046875,
      "reward_std": 0.2986268997192383,
      "rewards/accuracy_reward/mean": 0.373046875,
      "rewards/accuracy_reward/std": 0.48408737778663635,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 295
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3692.0,
      "completions/max_terminated_length": 3692.0,
      "completions/mean_length": 922.37890625,
      "completions/mean_terminated_length": 922.37890625,
      "completions/min_length": 356.0,
      "completions/min_terminated_length": 356.0,
      "epoch": 0.5784780749969464,
      "grad_norm": 0.24902058148005377,
      "learning_rate": 1e-06,
      "loss": -0.0014,
      "num_tokens": 139519552.0,
      "reward": 0.78515625,
      "reward_std": 0.21863040328025818,
      "rewards/accuracy_reward/mean": 0.28515625,
      "rewards/accuracy_reward/std": 0.45193037390708923,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 296
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4822.0,
      "completions/max_terminated_length": 4822.0,
      "completions/mean_length": 1075.9609375,
      "completions/mean_terminated_length": 1075.9609375,
      "completions/min_length": 381.0,
      "completions/min_terminated_length": 381.0,
      "epoch": 0.5804323928178821,
      "grad_norm": 0.20053619451889154,
      "learning_rate": 1e-06,
      "loss": -0.0074,
      "num_tokens": 140135452.0,
      "reward": 0.6337890625,
      "reward_std": 0.21054817736148834,
      "rewards/accuracy_reward/mean": 0.134765625,
      "rewards/accuracy_reward/std": 0.3418070077896118,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 297
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3689.0,
      "completions/max_terminated_length": 3689.0,
      "completions/mean_length": 1006.880859375,
      "completions/mean_terminated_length": 1006.880859375,
      "completions/min_length": 293.0,
      "completions/min_terminated_length": 293.0,
      "epoch": 0.5823867106388176,
      "grad_norm": 0.2721278374748833,
      "learning_rate": 1e-06,
      "loss": 0.0105,
      "num_tokens": 140735999.0,
      "reward": 0.896484375,
      "reward_std": 0.30730587244033813,
      "rewards/accuracy_reward/mean": 0.396484375,
      "rewards/accuracy_reward/std": 0.4896455705165863,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 298
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2951.0,
      "completions/max_terminated_length": 2951.0,
      "completions/mean_length": 915.736328125,
      "completions/mean_terminated_length": 915.736328125,
      "completions/min_length": 367.0,
      "completions/min_terminated_length": 367.0,
      "epoch": 0.5843410284597532,
      "grad_norm": 0.2755522670107793,
      "learning_rate": 1e-06,
      "loss": 0.0002,
      "num_tokens": 141269288.0,
      "reward": 0.974609375,
      "reward_std": 0.32747682929039,
      "rewards/accuracy_reward/mean": 0.474609375,
      "rewards/accuracy_reward/std": 0.4998432695865631,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 299
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3861.0,
      "completions/max_terminated_length": 3861.0,
      "completions/mean_length": 941.318359375,
      "completions/mean_terminated_length": 941.318359375,
      "completions/min_length": 288.0,
      "completions/min_terminated_length": 288.0,
      "epoch": 0.5862953462806889,
      "grad_norm": 0.2648539790797084,
      "learning_rate": 1e-06,
      "loss": 0.016,
      "num_tokens": 141842187.0,
      "reward": 0.9482421875,
      "reward_std": 0.3025415539741516,
      "rewards/accuracy_reward/mean": 0.44921875,
      "rewards/accuracy_reward/std": 0.497901052236557,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 300
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2542.0,
      "completions/max_terminated_length": 2542.0,
      "completions/mean_length": 895.84375,
      "completions/mean_terminated_length": 895.84375,
      "completions/min_length": 357.0,
      "completions/min_terminated_length": 357.0,
      "epoch": 0.5882496641016245,
      "grad_norm": 0.30717603139617383,
      "learning_rate": 1e-06,
      "loss": 0.0025,
      "num_tokens": 142376779.0,
      "reward": 0.955078125,
      "reward_std": 0.27775871753692627,
      "rewards/accuracy_reward/mean": 0.455078125,
      "rewards/accuracy_reward/std": 0.4984649419784546,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 301
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3184.0,
      "completions/max_terminated_length": 3184.0,
      "completions/mean_length": 921.248046875,
      "completions/mean_terminated_length": 921.248046875,
      "completions/min_length": 329.0,
      "completions/min_terminated_length": 329.0,
      "epoch": 0.5902039819225602,
      "grad_norm": 0.26303870393487144,
      "learning_rate": 1e-06,
      "loss": -0.0023,
      "num_tokens": 142914010.0,
      "reward": 0.927734375,
      "reward_std": 0.28719258308410645,
      "rewards/accuracy_reward/mean": 0.427734375,
      "rewards/accuracy_reward/std": 0.4952339828014374,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 302
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00390625,
      "completions/max_length": 6394.0,
      "completions/max_terminated_length": 6394.0,
      "completions/mean_length": 1127.466796875,
      "completions/mean_terminated_length": 1131.8883056640625,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 407.0,
      "epoch": 0.5921582997434958,
      "grad_norm": 0.1859365526140732,
      "learning_rate": 1e-06,
      "loss": -0.0011,
      "num_tokens": 143563609.0,
      "reward": 0.78125,
      "reward_std": 0.3581286370754242,
      "rewards/accuracy_reward/mean": 0.28125,
      "rewards/accuracy_reward/std": 0.45004892349243164,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 303
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 4257.0,
      "completions/max_terminated_length": 4257.0,
      "completions/mean_length": 1075.068359375,
      "completions/mean_terminated_length": 1077.1722412109375,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 272.0,
      "epoch": 0.5941126175644315,
      "grad_norm": 0.20976677263274324,
      "learning_rate": 1e-06,
      "loss": -0.0075,
      "num_tokens": 144184732.0,
      "reward": 0.794921875,
      "reward_std": 0.2793903350830078,
      "rewards/accuracy_reward/mean": 0.294921875,
      "rewards/accuracy_reward/std": 0.4564536213874817,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 304
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.02734375,
      "completions/max_length": 4493.0,
      "completions/max_terminated_length": 4493.0,
      "completions/mean_length": 1056.142578125,
      "completions/mean_terminated_length": 1085.833251953125,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 505.0,
      "epoch": 0.596066935385367,
      "grad_norm": 0.2370549287713461,
      "learning_rate": 1e-06,
      "loss": -0.0022,
      "num_tokens": 144796245.0,
      "reward": 0.8154296875,
      "reward_std": 0.29888656735420227,
      "rewards/accuracy_reward/mean": 0.31640625,
      "rewards/accuracy_reward/std": 0.46552830934524536,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 305
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 8155.0,
      "completions/max_terminated_length": 8155.0,
      "completions/mean_length": 1055.77734375,
      "completions/mean_terminated_length": 1126.16259765625,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 395.0,
      "epoch": 0.5980212532063027,
      "grad_norm": 0.2677183667506309,
      "learning_rate": 1e-06,
      "loss": -0.0363,
      "num_tokens": 145422675.0,
      "reward": 0.763671875,
      "reward_std": 0.36193913221359253,
      "rewards/accuracy_reward/mean": 0.26953125,
      "rewards/accuracy_reward/std": 0.44415023922920227,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.98828125,
      "rewards/soft_format_reward/std": 0.10772226005792618,
      "step": 306
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 4857.0,
      "completions/max_terminated_length": 4857.0,
      "completions/mean_length": 1103.701171875,
      "completions/mean_terminated_length": 1121.2203369140625,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 442.0,
      "epoch": 0.5999755710272383,
      "grad_norm": 0.20625630984182472,
      "learning_rate": 1e-06,
      "loss": -0.0057,
      "num_tokens": 146060442.0,
      "reward": 0.771484375,
      "reward_std": 0.2951411008834839,
      "rewards/accuracy_reward/mean": 0.271484375,
      "rewards/accuracy_reward/std": 0.44516023993492126,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 307
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017578125,
      "completions/max_length": 3827.0,
      "completions/max_terminated_length": 3827.0,
      "completions/mean_length": 1129.08203125,
      "completions/mean_terminated_length": 1149.2843017578125,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 475.0,
      "epoch": 0.6019298888481739,
      "grad_norm": 0.22505773820935762,
      "learning_rate": 1e-06,
      "loss": -0.0007,
      "num_tokens": 146717188.0,
      "reward": 0.814453125,
      "reward_std": 0.31528496742248535,
      "rewards/accuracy_reward/mean": 0.314453125,
      "rewards/accuracy_reward/std": 0.4647517800331116,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 308
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0078125,
      "completions/max_length": 4405.0,
      "completions/max_terminated_length": 4405.0,
      "completions/mean_length": 1142.482421875,
      "completions/mean_terminated_length": 1151.4783935546875,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 539.0,
      "epoch": 0.6038842066691096,
      "grad_norm": 0.1971569822415046,
      "learning_rate": 1e-06,
      "loss": 0.0104,
      "num_tokens": 147377931.0,
      "reward": 0.859375,
      "reward_std": 0.3297439515590668,
      "rewards/accuracy_reward/mean": 0.359375,
      "rewards/accuracy_reward/std": 0.48028653860092163,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 309
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5132.0,
      "completions/max_terminated_length": 5132.0,
      "completions/mean_length": 1025.68359375,
      "completions/mean_terminated_length": 1025.68359375,
      "completions/min_length": 347.0,
      "completions/min_terminated_length": 347.0,
      "epoch": 0.6058385244900452,
      "grad_norm": 0.2330771803944694,
      "learning_rate": 1e-06,
      "loss": 0.0075,
      "num_tokens": 147970057.0,
      "reward": 0.884765625,
      "reward_std": 0.31399965286254883,
      "rewards/accuracy_reward/mean": 0.384765625,
      "rewards/accuracy_reward/std": 0.4870156943798065,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 310
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 2709.0,
      "completions/max_terminated_length": 2709.0,
      "completions/mean_length": 965.142578125,
      "completions/mean_terminated_length": 967.0313110351562,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 276.0,
      "epoch": 0.6077928423109809,
      "grad_norm": 0.2278993739996118,
      "learning_rate": 1e-06,
      "loss": 0.0144,
      "num_tokens": 148527714.0,
      "reward": 0.8203125,
      "reward_std": 0.3155643939971924,
      "rewards/accuracy_reward/mean": 0.3203125,
      "rewards/accuracy_reward/std": 0.4670529365539551,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 311
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4086.0,
      "completions/max_terminated_length": 4086.0,
      "completions/mean_length": 1019.064453125,
      "completions/mean_terminated_length": 1019.064453125,
      "completions/min_length": 406.0,
      "completions/min_terminated_length": 406.0,
      "epoch": 0.6097471601319164,
      "grad_norm": 0.21361335909439352,
      "learning_rate": 1e-06,
      "loss": -0.0124,
      "num_tokens": 149118387.0,
      "reward": 0.841796875,
      "reward_std": 0.23987311124801636,
      "rewards/accuracy_reward/mean": 0.341796875,
      "rewards/accuracy_reward/std": 0.4747757613658905,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 312
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3634.0,
      "completions/max_terminated_length": 3634.0,
      "completions/mean_length": 1108.998046875,
      "completions/mean_terminated_length": 1108.998046875,
      "completions/min_length": 350.0,
      "completions/min_terminated_length": 350.0,
      "epoch": 0.6117014779528521,
      "grad_norm": 0.18715575552255137,
      "learning_rate": 1e-06,
      "loss": 0.0094,
      "num_tokens": 149756690.0,
      "reward": 0.7890625,
      "reward_std": 0.28230345249176025,
      "rewards/accuracy_reward/mean": 0.2890625,
      "rewards/accuracy_reward/std": 0.45377036929130554,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 313
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4580.0,
      "completions/max_terminated_length": 4580.0,
      "completions/mean_length": 1173.43359375,
      "completions/mean_terminated_length": 1173.43359375,
      "completions/min_length": 364.0,
      "completions/min_terminated_length": 364.0,
      "epoch": 0.6136557957737877,
      "grad_norm": 0.19816350569687385,
      "learning_rate": 1e-06,
      "loss": 0.0123,
      "num_tokens": 150421936.0,
      "reward": 0.81640625,
      "reward_std": 0.3708382844924927,
      "rewards/accuracy_reward/mean": 0.31640625,
      "rewards/accuracy_reward/std": 0.46552830934524536,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 314
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4249.0,
      "completions/max_terminated_length": 4249.0,
      "completions/mean_length": 1102.232421875,
      "completions/mean_terminated_length": 1102.232421875,
      "completions/min_length": 379.0,
      "completions/min_terminated_length": 379.0,
      "epoch": 0.6156101135947234,
      "grad_norm": 0.21601955360633918,
      "learning_rate": 1e-06,
      "loss": 0.0142,
      "num_tokens": 151050535.0,
      "reward": 0.91796875,
      "reward_std": 0.3318660855293274,
      "rewards/accuracy_reward/mean": 0.419921875,
      "rewards/accuracy_reward/std": 0.4940285086631775,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 315
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2819.0,
      "completions/max_terminated_length": 2819.0,
      "completions/mean_length": 1078.234375,
      "completions/mean_terminated_length": 1078.234375,
      "completions/min_length": 419.0,
      "completions/min_terminated_length": 419.0,
      "epoch": 0.617564431415659,
      "grad_norm": 0.2283945434889413,
      "learning_rate": 1e-06,
      "loss": 0.0029,
      "num_tokens": 151672575.0,
      "reward": 0.8828125,
      "reward_std": 0.34946271777153015,
      "rewards/accuracy_reward/mean": 0.3828125,
      "rewards/accuracy_reward/std": 0.486548513174057,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 316
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5227.0,
      "completions/max_terminated_length": 5227.0,
      "completions/mean_length": 1081.626953125,
      "completions/mean_terminated_length": 1081.626953125,
      "completions/min_length": 302.0,
      "completions/min_terminated_length": 302.0,
      "epoch": 0.6195187492365946,
      "grad_norm": 0.20041741622788212,
      "learning_rate": 1e-06,
      "loss": 0.0001,
      "num_tokens": 152287872.0,
      "reward": 0.84765625,
      "reward_std": 0.21362388134002686,
      "rewards/accuracy_reward/mean": 0.34765625,
      "rewards/accuracy_reward/std": 0.47669193148612976,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 317
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4295.0,
      "completions/max_terminated_length": 4295.0,
      "completions/mean_length": 1044.775390625,
      "completions/mean_terminated_length": 1044.775390625,
      "completions/min_length": 370.0,
      "completions/min_terminated_length": 370.0,
      "epoch": 0.6214730670575302,
      "grad_norm": 0.19355071134987356,
      "learning_rate": 1e-06,
      "loss": -0.0011,
      "num_tokens": 152890317.0,
      "reward": 0.8798828125,
      "reward_std": 0.26457107067108154,
      "rewards/accuracy_reward/mean": 0.380859375,
      "rewards/accuracy_reward/std": 0.48607301712036133,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 318
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3944.0,
      "completions/max_terminated_length": 3944.0,
      "completions/mean_length": 1091.482421875,
      "completions/mean_terminated_length": 1091.482421875,
      "completions/min_length": 404.0,
      "completions/min_terminated_length": 404.0,
      "epoch": 0.6234273848784658,
      "grad_norm": 0.2167859797293233,
      "learning_rate": 1e-06,
      "loss": -0.0026,
      "num_tokens": 153512580.0,
      "reward": 0.85546875,
      "reward_std": 0.3550078868865967,
      "rewards/accuracy_reward/mean": 0.35546875,
      "rewards/accuracy_reward/std": 0.47912323474884033,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 319
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4463.0,
      "completions/max_terminated_length": 4463.0,
      "completions/mean_length": 1125.564453125,
      "completions/mean_terminated_length": 1125.564453125,
      "completions/min_length": 299.0,
      "completions/min_terminated_length": 299.0,
      "epoch": 0.6253817026994015,
      "grad_norm": 0.18167371714427352,
      "learning_rate": 1e-06,
      "loss": 0.0382,
      "num_tokens": 154150677.0,
      "reward": 0.98046875,
      "reward_std": 0.28993624448776245,
      "rewards/accuracy_reward/mean": 0.484375,
      "rewards/accuracy_reward/std": 0.5002445578575134,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.9921875,
      "rewards/soft_format_reward/std": 0.08812850713729858,
      "step": 320
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 4365.0,
      "completions/max_terminated_length": 4365.0,
      "completions/mean_length": 1042.69921875,
      "completions/mean_terminated_length": 1044.73974609375,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 350.0,
      "epoch": 0.6273360205203371,
      "grad_norm": 0.21140532926422836,
      "learning_rate": 1e-06,
      "loss": 0.0209,
      "num_tokens": 154743723.0,
      "reward": 0.9814453125,
      "reward_std": 0.29769521951675415,
      "rewards/accuracy_reward/mean": 0.482421875,
      "rewards/accuracy_reward/std": 0.5001795887947083,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 321
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 3862.0,
      "completions/max_terminated_length": 3862.0,
      "completions/mean_length": 940.111328125,
      "completions/mean_terminated_length": 941.9510498046875,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 232.0,
      "epoch": 0.6292903383412728,
      "grad_norm": 0.20156774180148904,
      "learning_rate": 1e-06,
      "loss": 0.0096,
      "num_tokens": 155284372.0,
      "reward": 1.0185546875,
      "reward_std": 0.2644067108631134,
      "rewards/accuracy_reward/mean": 0.51953125,
      "rewards/accuracy_reward/std": 0.5001069903373718,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 322
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4154.0,
      "completions/max_terminated_length": 4154.0,
      "completions/mean_length": 951.70703125,
      "completions/mean_terminated_length": 951.70703125,
      "completions/min_length": 334.0,
      "completions/min_terminated_length": 334.0,
      "epoch": 0.6312446561622084,
      "grad_norm": 0.19357741577941043,
      "learning_rate": 1e-06,
      "loss": 0.0089,
      "num_tokens": 155826622.0,
      "reward": 1.0458984375,
      "reward_std": 0.27975112199783325,
      "rewards/accuracy_reward/mean": 0.546875,
      "rewards/accuracy_reward/std": 0.4982847273349762,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 323
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4951.0,
      "completions/max_terminated_length": 4951.0,
      "completions/mean_length": 977.861328125,
      "completions/mean_terminated_length": 977.861328125,
      "completions/min_length": 381.0,
      "completions/min_terminated_length": 381.0,
      "epoch": 0.633198973983144,
      "grad_norm": 0.25662343345447175,
      "learning_rate": 1e-06,
      "loss": 0.0051,
      "num_tokens": 156388663.0,
      "reward": 0.8388671875,
      "reward_std": 0.306870698928833,
      "rewards/accuracy_reward/mean": 0.33984375,
      "rewards/accuracy_reward/std": 0.4741191864013672,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 324
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 4168.0,
      "completions/max_terminated_length": 4168.0,
      "completions/mean_length": 995.265625,
      "completions/mean_terminated_length": 997.2133178710938,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 345.0,
      "epoch": 0.6351532918040796,
      "grad_norm": 0.2527045594097066,
      "learning_rate": 1e-06,
      "loss": 0.012,
      "num_tokens": 156965023.0,
      "reward": 0.921875,
      "reward_std": 0.2805803716182709,
      "rewards/accuracy_reward/mean": 0.423828125,
      "rewards/accuracy_reward/std": 0.4946470856666565,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 325
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3506.0,
      "completions/max_terminated_length": 3506.0,
      "completions/mean_length": 932.640625,
      "completions/mean_terminated_length": 932.640625,
      "completions/min_length": 323.0,
      "completions/min_terminated_length": 323.0,
      "epoch": 0.6371076096250152,
      "grad_norm": 0.2431799639626169,
      "learning_rate": 1e-06,
      "loss": -0.0022,
      "num_tokens": 157506455.0,
      "reward": 0.896484375,
      "reward_std": 0.28797444701194763,
      "rewards/accuracy_reward/mean": 0.396484375,
      "rewards/accuracy_reward/std": 0.4896455705165863,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 326
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4566.0,
      "completions/max_terminated_length": 4566.0,
      "completions/mean_length": 1044.732421875,
      "completions/mean_terminated_length": 1044.732421875,
      "completions/min_length": 268.0,
      "completions/min_terminated_length": 268.0,
      "epoch": 0.6390619274459509,
      "grad_norm": 0.20157542110570098,
      "learning_rate": 1e-06,
      "loss": 0.022,
      "num_tokens": 158095806.0,
      "reward": 0.86328125,
      "reward_std": 0.2860869765281677,
      "rewards/accuracy_reward/mean": 0.36328125,
      "rewards/accuracy_reward/std": 0.4814152419567108,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 327
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2673.0,
      "completions/max_terminated_length": 2673.0,
      "completions/mean_length": 960.6015625,
      "completions/mean_terminated_length": 960.6015625,
      "completions/min_length": 368.0,
      "completions/min_terminated_length": 368.0,
      "epoch": 0.6410162452668865,
      "grad_norm": 0.20509448699954397,
      "learning_rate": 1e-06,
      "loss": 0.0079,
      "num_tokens": 158656114.0,
      "reward": 0.90625,
      "reward_std": 0.32110393047332764,
      "rewards/accuracy_reward/mean": 0.40625,
      "rewards/accuracy_reward/std": 0.49161264300346375,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 328
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1927.0,
      "completions/max_terminated_length": 1927.0,
      "completions/mean_length": 1042.87890625,
      "completions/mean_terminated_length": 1042.87890625,
      "completions/min_length": 342.0,
      "completions/min_terminated_length": 342.0,
      "epoch": 0.6429705630878222,
      "grad_norm": 0.16857134446456032,
      "learning_rate": 1e-06,
      "loss": 0.0062,
      "num_tokens": 159252868.0,
      "reward": 0.91015625,
      "reward_std": 0.27819085121154785,
      "rewards/accuracy_reward/mean": 0.41015625,
      "rewards/accuracy_reward/std": 0.49234291911125183,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 329
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4079.0,
      "completions/max_terminated_length": 4079.0,
      "completions/mean_length": 919.9296875,
      "completions/mean_terminated_length": 919.9296875,
      "completions/min_length": 405.0,
      "completions/min_terminated_length": 405.0,
      "epoch": 0.6449248809087578,
      "grad_norm": 0.22150563572303353,
      "learning_rate": 1e-06,
      "loss": 0.0112,
      "num_tokens": 159783200.0,
      "reward": 0.974609375,
      "reward_std": 0.32176676392555237,
      "rewards/accuracy_reward/mean": 0.474609375,
      "rewards/accuracy_reward/std": 0.4998432695865631,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 330
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 8046.0,
      "completions/max_terminated_length": 8046.0,
      "completions/mean_length": 1018.91796875,
      "completions/mean_terminated_length": 1018.91796875,
      "completions/min_length": 305.0,
      "completions/min_terminated_length": 305.0,
      "epoch": 0.6468791987296935,
      "grad_norm": 0.20252483124411913,
      "learning_rate": 1e-06,
      "loss": 0.0138,
      "num_tokens": 160361974.0,
      "reward": 0.962890625,
      "reward_std": 0.2582211494445801,
      "rewards/accuracy_reward/mean": 0.462890625,
      "rewards/accuracy_reward/std": 0.4991086423397064,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 331
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3622.0,
      "completions/max_terminated_length": 3622.0,
      "completions/mean_length": 1080.962890625,
      "completions/mean_terminated_length": 1080.962890625,
      "completions/min_length": 387.0,
      "completions/min_terminated_length": 387.0,
      "epoch": 0.648833516550629,
      "grad_norm": 0.23036417412465934,
      "learning_rate": 1e-06,
      "loss": 0.0296,
      "num_tokens": 160975283.0,
      "reward": 0.986328125,
      "reward_std": 0.39269354939460754,
      "rewards/accuracy_reward/mean": 0.486328125,
      "rewards/accuracy_reward/std": 0.5003018379211426,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 332
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4387.0,
      "completions/max_terminated_length": 4387.0,
      "completions/mean_length": 1199.365234375,
      "completions/mean_terminated_length": 1199.365234375,
      "completions/min_length": 358.0,
      "completions/min_terminated_length": 358.0,
      "epoch": 0.6507878343715646,
      "grad_norm": 0.16341466775437016,
      "learning_rate": 1e-06,
      "loss": 0.0002,
      "num_tokens": 161656366.0,
      "reward": 0.87109375,
      "reward_std": 0.28486648201942444,
      "rewards/accuracy_reward/mean": 0.37109375,
      "rewards/accuracy_reward/std": 0.4835699498653412,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 333
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4626.0,
      "completions/max_terminated_length": 4626.0,
      "completions/mean_length": 1042.716796875,
      "completions/mean_terminated_length": 1042.716796875,
      "completions/min_length": 355.0,
      "completions/min_terminated_length": 355.0,
      "epoch": 0.6527421521925003,
      "grad_norm": 0.17750826738799025,
      "learning_rate": 1e-06,
      "loss": 0.0212,
      "num_tokens": 162253037.0,
      "reward": 0.935546875,
      "reward_std": 0.2184910923242569,
      "rewards/accuracy_reward/mean": 0.435546875,
      "rewards/accuracy_reward/std": 0.49631330370903015,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 334
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4957.0,
      "completions/max_terminated_length": 4957.0,
      "completions/mean_length": 968.923828125,
      "completions/mean_terminated_length": 968.923828125,
      "completions/min_length": 315.0,
      "completions/min_terminated_length": 315.0,
      "epoch": 0.6546964700134359,
      "grad_norm": 0.1861420754343207,
      "learning_rate": 1e-06,
      "loss": -0.0013,
      "num_tokens": 162810614.0,
      "reward": 0.966796875,
      "reward_std": 0.25689640641212463,
      "rewards/accuracy_reward/mean": 0.466796875,
      "rewards/accuracy_reward/std": 0.4993842542171478,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 335
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3891.0,
      "completions/max_terminated_length": 3891.0,
      "completions/mean_length": 974.318359375,
      "completions/mean_terminated_length": 974.318359375,
      "completions/min_length": 332.0,
      "completions/min_terminated_length": 332.0,
      "epoch": 0.6566507878343716,
      "grad_norm": 0.2244008002693882,
      "learning_rate": 1e-06,
      "loss": 0.0228,
      "num_tokens": 163372809.0,
      "reward": 1.037109375,
      "reward_std": 0.3517191410064697,
      "rewards/accuracy_reward/mean": 0.537109375,
      "rewards/accuracy_reward/std": 0.4991086423397064,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 336
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3963.0,
      "completions/max_terminated_length": 3963.0,
      "completions/mean_length": 957.732421875,
      "completions/mean_terminated_length": 957.732421875,
      "completions/min_length": 382.0,
      "completions/min_terminated_length": 382.0,
      "epoch": 0.6586051056553072,
      "grad_norm": 0.20039358943002755,
      "learning_rate": 1e-06,
      "loss": 0.0042,
      "num_tokens": 163926080.0,
      "reward": 0.984375,
      "reward_std": 0.31068554520606995,
      "rewards/accuracy_reward/mean": 0.484375,
      "rewards/accuracy_reward/std": 0.5002445578575134,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 337
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5444.0,
      "completions/max_terminated_length": 5444.0,
      "completions/mean_length": 972.892578125,
      "completions/mean_terminated_length": 972.892578125,
      "completions/min_length": 430.0,
      "completions/min_terminated_length": 430.0,
      "epoch": 0.6605594234762429,
      "grad_norm": 0.23330319568338742,
      "learning_rate": 1e-06,
      "loss": 0.0158,
      "num_tokens": 164486569.0,
      "reward": 0.900390625,
      "reward_std": 0.3330186903476715,
      "rewards/accuracy_reward/mean": 0.400390625,
      "rewards/accuracy_reward/std": 0.4904567301273346,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 338
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2522.0,
      "completions/max_terminated_length": 2522.0,
      "completions/mean_length": 954.451171875,
      "completions/mean_terminated_length": 954.451171875,
      "completions/min_length": 299.0,
      "completions/min_terminated_length": 299.0,
      "epoch": 0.6625137412971784,
      "grad_norm": 0.1880611200124491,
      "learning_rate": 1e-06,
      "loss": 0.0161,
      "num_tokens": 165036656.0,
      "reward": 0.95703125,
      "reward_std": 0.2559651732444763,
      "rewards/accuracy_reward/mean": 0.45703125,
      "rewards/accuracy_reward/std": 0.49863746762275696,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 339
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5267.0,
      "completions/max_terminated_length": 5267.0,
      "completions/mean_length": 1018.328125,
      "completions/mean_terminated_length": 1018.328125,
      "completions/min_length": 385.0,
      "completions/min_terminated_length": 385.0,
      "epoch": 0.6644680591181141,
      "grad_norm": 0.19915271630621767,
      "learning_rate": 1e-06,
      "loss": 0.0062,
      "num_tokens": 165619032.0,
      "reward": 0.9423828125,
      "reward_std": 0.36097452044487,
      "rewards/accuracy_reward/mean": 0.443359375,
      "rewards/accuracy_reward/std": 0.49726733565330505,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 340
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4165.0,
      "completions/max_terminated_length": 4165.0,
      "completions/mean_length": 906.5546875,
      "completions/mean_terminated_length": 906.5546875,
      "completions/min_length": 389.0,
      "completions/min_terminated_length": 389.0,
      "epoch": 0.6664223769390497,
      "grad_norm": 0.20277825440489383,
      "learning_rate": 1e-06,
      "loss": 0.0225,
      "num_tokens": 166130756.0,
      "reward": 1.126953125,
      "reward_std": 0.281166672706604,
      "rewards/accuracy_reward/mean": 0.626953125,
      "rewards/accuracy_reward/std": 0.48408737778663635,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 341
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5169.0,
      "completions/max_terminated_length": 5169.0,
      "completions/mean_length": 998.9765625,
      "completions/mean_terminated_length": 998.9765625,
      "completions/min_length": 259.0,
      "completions/min_terminated_length": 259.0,
      "epoch": 0.6683766947599853,
      "grad_norm": 0.1702251710038088,
      "learning_rate": 1e-06,
      "loss": -0.0049,
      "num_tokens": 166696248.0,
      "reward": 0.955078125,
      "reward_std": 0.27338704466819763,
      "rewards/accuracy_reward/mean": 0.455078125,
      "rewards/accuracy_reward/std": 0.4984649419784546,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 342
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4518.0,
      "completions/max_terminated_length": 4518.0,
      "completions/mean_length": 1054.990234375,
      "completions/mean_terminated_length": 1054.990234375,
      "completions/min_length": 356.0,
      "completions/min_terminated_length": 356.0,
      "epoch": 0.670331012580921,
      "grad_norm": 0.2023491526993484,
      "learning_rate": 1e-06,
      "loss": 0.0261,
      "num_tokens": 167293539.0,
      "reward": 1.0185546875,
      "reward_std": 0.33125221729278564,
      "rewards/accuracy_reward/mean": 0.521484375,
      "rewards/accuracy_reward/std": 0.5000267624855042,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.994140625,
      "rewards/soft_format_reward/std": 0.07639661431312561,
      "step": 343
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2903.0,
      "completions/max_terminated_length": 2903.0,
      "completions/mean_length": 922.474609375,
      "completions/mean_terminated_length": 922.474609375,
      "completions/min_length": 370.0,
      "completions/min_terminated_length": 370.0,
      "epoch": 0.6722853304018566,
      "grad_norm": 0.21977580067642352,
      "learning_rate": 1e-06,
      "loss": -0.0037,
      "num_tokens": 167827126.0,
      "reward": 1.052734375,
      "reward_std": 0.3389337956905365,
      "rewards/accuracy_reward/mean": 0.552734375,
      "rewards/accuracy_reward/std": 0.4976975917816162,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 344
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3251.0,
      "completions/max_terminated_length": 3251.0,
      "completions/mean_length": 1021.240234375,
      "completions/mean_terminated_length": 1021.240234375,
      "completions/min_length": 296.0,
      "completions/min_terminated_length": 296.0,
      "epoch": 0.6742396482227923,
      "grad_norm": 0.18814944592818797,
      "learning_rate": 1e-06,
      "loss": -0.0102,
      "num_tokens": 168408193.0,
      "reward": 1.029296875,
      "reward_std": 0.30225419998168945,
      "rewards/accuracy_reward/mean": 0.529296875,
      "rewards/accuracy_reward/std": 0.49962911009788513,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 345
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4461.0,
      "completions/max_terminated_length": 4461.0,
      "completions/mean_length": 979.4453125,
      "completions/mean_terminated_length": 979.4453125,
      "completions/min_length": 332.0,
      "completions/min_terminated_length": 332.0,
      "epoch": 0.6761939660437278,
      "grad_norm": 0.20340291514882095,
      "learning_rate": 1e-06,
      "loss": 0.0005,
      "num_tokens": 168965941.0,
      "reward": 0.9453125,
      "reward_std": 0.3172352910041809,
      "rewards/accuracy_reward/mean": 0.4453125,
      "rewards/accuracy_reward/std": 0.49748632311820984,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 346
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2999.0,
      "completions/max_terminated_length": 2999.0,
      "completions/mean_length": 956.697265625,
      "completions/mean_terminated_length": 956.697265625,
      "completions/min_length": 328.0,
      "completions/min_terminated_length": 328.0,
      "epoch": 0.6781482838646635,
      "grad_norm": 0.17474088253640468,
      "learning_rate": 1e-06,
      "loss": 0.0102,
      "num_tokens": 169512314.0,
      "reward": 0.8671875,
      "reward_std": 0.23352685570716858,
      "rewards/accuracy_reward/mean": 0.3671875,
      "rewards/accuracy_reward/std": 0.48250964283943176,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 347
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4268.0,
      "completions/max_terminated_length": 4268.0,
      "completions/mean_length": 1066.955078125,
      "completions/mean_terminated_length": 1066.955078125,
      "completions/min_length": 405.0,
      "completions/min_terminated_length": 405.0,
      "epoch": 0.6801026016855991,
      "grad_norm": 0.19888757329448092,
      "learning_rate": 1e-06,
      "loss": 0.0088,
      "num_tokens": 170127539.0,
      "reward": 0.888671875,
      "reward_std": 0.24496980011463165,
      "rewards/accuracy_reward/mean": 0.388671875,
      "rewards/accuracy_reward/std": 0.4879252314567566,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 348
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 7847.0,
      "completions/max_terminated_length": 7847.0,
      "completions/mean_length": 999.2578125,
      "completions/mean_terminated_length": 999.2578125,
      "completions/min_length": 367.0,
      "completions/min_terminated_length": 367.0,
      "epoch": 0.6820569195065348,
      "grad_norm": 0.2395324660694693,
      "learning_rate": 1e-06,
      "loss": 0.0313,
      "num_tokens": 170701399.0,
      "reward": 1.0009765625,
      "reward_std": 0.32716602087020874,
      "rewards/accuracy_reward/mean": 0.501953125,
      "rewards/accuracy_reward/std": 0.5004851818084717,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 349
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2335.0,
      "completions/max_terminated_length": 2335.0,
      "completions/mean_length": 929.82421875,
      "completions/mean_terminated_length": 929.82421875,
      "completions/min_length": 326.0,
      "completions/min_terminated_length": 326.0,
      "epoch": 0.6840112373274704,
      "grad_norm": 0.22050010168287415,
      "learning_rate": 1e-06,
      "loss": 0.0223,
      "num_tokens": 171235933.0,
      "reward": 0.99609375,
      "reward_std": 0.32060712575912476,
      "rewards/accuracy_reward/mean": 0.49609375,
      "rewards/accuracy_reward/std": 0.5004737377166748,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 350
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5183.0,
      "completions/max_terminated_length": 5183.0,
      "completions/mean_length": 983.298828125,
      "completions/mean_terminated_length": 983.298828125,
      "completions/min_length": 317.0,
      "completions/min_terminated_length": 317.0,
      "epoch": 0.685965555148406,
      "grad_norm": 0.20864405778217568,
      "learning_rate": 1e-06,
      "loss": 0.0142,
      "num_tokens": 171802806.0,
      "reward": 0.8955078125,
      "reward_std": 0.26462340354919434,
      "rewards/accuracy_reward/mean": 0.396484375,
      "rewards/accuracy_reward/std": 0.4896455705165863,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 351
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2476.0,
      "completions/max_terminated_length": 2476.0,
      "completions/mean_length": 992.28515625,
      "completions/mean_terminated_length": 992.28515625,
      "completions/min_length": 303.0,
      "completions/min_terminated_length": 303.0,
      "epoch": 0.6879198729693417,
      "grad_norm": 0.1939572330378823,
      "learning_rate": 1e-06,
      "loss": 0.0231,
      "num_tokens": 172367336.0,
      "reward": 0.904296875,
      "reward_std": 0.27583566308021545,
      "rewards/accuracy_reward/mean": 0.404296875,
      "rewards/accuracy_reward/std": 0.4912354052066803,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 352
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5556.0,
      "completions/max_terminated_length": 5556.0,
      "completions/mean_length": 993.232421875,
      "completions/mean_terminated_length": 993.232421875,
      "completions/min_length": 321.0,
      "completions/min_terminated_length": 321.0,
      "epoch": 0.6898741907902772,
      "grad_norm": 0.21178878140983268,
      "learning_rate": 1e-06,
      "loss": 0.0138,
      "num_tokens": 172934575.0,
      "reward": 1.01953125,
      "reward_std": 0.252077579498291,
      "rewards/accuracy_reward/mean": 0.51953125,
      "rewards/accuracy_reward/std": 0.5001069903373718,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 353
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 4506.0,
      "completions/max_terminated_length": 4506.0,
      "completions/mean_length": 1025.25,
      "completions/mean_terminated_length": 1027.25634765625,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 447.0,
      "epoch": 0.6918285086112129,
      "grad_norm": 0.2220756692952701,
      "learning_rate": 1e-06,
      "loss": -0.0004,
      "num_tokens": 173516511.0,
      "reward": 0.849609375,
      "reward_std": 0.30644211173057556,
      "rewards/accuracy_reward/mean": 0.3515625,
      "rewards/accuracy_reward/std": 0.4779251217842102,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 354
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3937.0,
      "completions/max_terminated_length": 3937.0,
      "completions/mean_length": 973.056640625,
      "completions/mean_terminated_length": 973.056640625,
      "completions/min_length": 279.0,
      "completions/min_terminated_length": 279.0,
      "epoch": 0.6937828264321485,
      "grad_norm": 0.2723223044180947,
      "learning_rate": 1e-06,
      "loss": 0.0331,
      "num_tokens": 174083500.0,
      "reward": 0.9736328125,
      "reward_std": 0.3103483319282532,
      "rewards/accuracy_reward/mean": 0.474609375,
      "rewards/accuracy_reward/std": 0.4998432695865631,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 355
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6884.0,
      "completions/max_terminated_length": 6884.0,
      "completions/mean_length": 959.931640625,
      "completions/mean_terminated_length": 959.931640625,
      "completions/min_length": 307.0,
      "completions/min_terminated_length": 307.0,
      "epoch": 0.6957371442530842,
      "grad_norm": 0.2234927815528302,
      "learning_rate": 1e-06,
      "loss": 0.03,
      "num_tokens": 174633833.0,
      "reward": 0.978515625,
      "reward_std": 0.30642932653427124,
      "rewards/accuracy_reward/mean": 0.478515625,
      "rewards/accuracy_reward/std": 0.5000267624855042,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 356
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 6013.0,
      "completions/max_terminated_length": 6013.0,
      "completions/mean_length": 939.26171875,
      "completions/mean_terminated_length": 941.0997924804688,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 291.0,
      "epoch": 0.6976914620740198,
      "grad_norm": 0.2809864981267674,
      "learning_rate": 1e-06,
      "loss": 0.0049,
      "num_tokens": 175178799.0,
      "reward": 0.998046875,
      "reward_std": 0.3423428535461426,
      "rewards/accuracy_reward/mean": 0.498046875,
      "rewards/accuracy_reward/std": 0.5004851818084717,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 357
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4325.0,
      "completions/max_terminated_length": 4325.0,
      "completions/mean_length": 1063.2890625,
      "completions/mean_terminated_length": 1063.2890625,
      "completions/min_length": 341.0,
      "completions/min_terminated_length": 341.0,
      "epoch": 0.6996457798949555,
      "grad_norm": 0.19185919129350879,
      "learning_rate": 1e-06,
      "loss": 0.0187,
      "num_tokens": 175778979.0,
      "reward": 0.8388671875,
      "reward_std": 0.2665994167327881,
      "rewards/accuracy_reward/mean": 0.33984375,
      "rewards/accuracy_reward/std": 0.4741191864013672,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 358
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3015.0,
      "completions/max_terminated_length": 3015.0,
      "completions/mean_length": 966.79296875,
      "completions/mean_terminated_length": 966.79296875,
      "completions/min_length": 343.0,
      "completions/min_terminated_length": 343.0,
      "epoch": 0.701600097715891,
      "grad_norm": 0.21556774284686625,
      "learning_rate": 1e-06,
      "loss": -0.0061,
      "num_tokens": 176332777.0,
      "reward": 1.037109375,
      "reward_std": 0.2835124731063843,
      "rewards/accuracy_reward/mean": 0.5390625,
      "rewards/accuracy_reward/std": 0.4989593029022217,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 359
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 5211.0,
      "completions/max_terminated_length": 5211.0,
      "completions/mean_length": 1076.86328125,
      "completions/mean_terminated_length": 1078.9705810546875,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 422.0,
      "epoch": 0.7035544155368266,
      "grad_norm": 0.19706033942634024,
      "learning_rate": 1e-06,
      "loss": 0.0284,
      "num_tokens": 176949043.0,
      "reward": 0.96875,
      "reward_std": 0.31767719984054565,
      "rewards/accuracy_reward/mean": 0.47265625,
      "rewards/accuracy_reward/std": 0.49974003434181213,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.9921875,
      "rewards/soft_format_reward/std": 0.08812850713729858,
      "step": 360
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6451.0,
      "completions/max_terminated_length": 6451.0,
      "completions/mean_length": 1125.8359375,
      "completions/mean_terminated_length": 1125.8359375,
      "completions/min_length": 344.0,
      "completions/min_terminated_length": 344.0,
      "epoch": 0.7055087333577623,
      "grad_norm": 0.23532020192287942,
      "learning_rate": 1e-06,
      "loss": 0.0277,
      "num_tokens": 177586511.0,
      "reward": 0.931640625,
      "reward_std": 0.341478168964386,
      "rewards/accuracy_reward/mean": 0.435546875,
      "rewards/accuracy_reward/std": 0.49631330370903015,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.9921875,
      "rewards/soft_format_reward/std": 0.08812850713729858,
      "step": 361
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4267.0,
      "completions/max_terminated_length": 4267.0,
      "completions/mean_length": 1079.51171875,
      "completions/mean_terminated_length": 1079.51171875,
      "completions/min_length": 354.0,
      "completions/min_terminated_length": 354.0,
      "epoch": 0.7074630511786979,
      "grad_norm": 0.23254178529562144,
      "learning_rate": 1e-06,
      "loss": -0.0064,
      "num_tokens": 178195493.0,
      "reward": 1.0556640625,
      "reward_std": 0.40996870398521423,
      "rewards/accuracy_reward/mean": 0.55859375,
      "rewards/accuracy_reward/std": 0.4970405399799347,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.994140625,
      "rewards/soft_format_reward/std": 0.07639661431312561,
      "step": 362
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4531.0,
      "completions/max_terminated_length": 4531.0,
      "completions/mean_length": 1043.580078125,
      "completions/mean_terminated_length": 1043.580078125,
      "completions/min_length": 403.0,
      "completions/min_terminated_length": 403.0,
      "epoch": 0.7094173689996336,
      "grad_norm": 0.21972027991746884,
      "learning_rate": 1e-06,
      "loss": 0.0108,
      "num_tokens": 178797582.0,
      "reward": 0.794921875,
      "reward_std": 0.23968225717544556,
      "rewards/accuracy_reward/mean": 0.298828125,
      "rewards/accuracy_reward/std": 0.45819199085235596,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.9921875,
      "rewards/soft_format_reward/std": 0.08812850713729858,
      "step": 363
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4170.0,
      "completions/max_terminated_length": 4170.0,
      "completions/mean_length": 1076.59765625,
      "completions/mean_terminated_length": 1076.59765625,
      "completions/min_length": 341.0,
      "completions/min_terminated_length": 341.0,
      "epoch": 0.7113716868205692,
      "grad_norm": 0.22569527507457063,
      "learning_rate": 1e-06,
      "loss": 0.011,
      "num_tokens": 179408256.0,
      "reward": 0.85546875,
      "reward_std": 0.2965734899044037,
      "rewards/accuracy_reward/mean": 0.357421875,
      "rewards/accuracy_reward/std": 0.4797092080116272,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 364
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3941.0,
      "completions/max_terminated_length": 3941.0,
      "completions/mean_length": 979.7578125,
      "completions/mean_terminated_length": 979.7578125,
      "completions/min_length": 341.0,
      "completions/min_terminated_length": 341.0,
      "epoch": 0.7133260046415049,
      "grad_norm": 0.2065068796590369,
      "learning_rate": 1e-06,
      "loss": -0.0057,
      "num_tokens": 179976676.0,
      "reward": 0.8095703125,
      "reward_std": 0.2661935091018677,
      "rewards/accuracy_reward/mean": 0.310546875,
      "rewards/accuracy_reward/std": 0.46317005157470703,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 365
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3976.0,
      "completions/max_terminated_length": 3976.0,
      "completions/mean_length": 969.2734375,
      "completions/mean_terminated_length": 969.2734375,
      "completions/min_length": 351.0,
      "completions/min_terminated_length": 351.0,
      "epoch": 0.7152803224624404,
      "grad_norm": 0.20936776707413618,
      "learning_rate": 1e-06,
      "loss": 0.0187,
      "num_tokens": 180530416.0,
      "reward": 1.05078125,
      "reward_std": 0.29084792733192444,
      "rewards/accuracy_reward/mean": 0.55078125,
      "rewards/accuracy_reward/std": 0.497901052236557,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 366
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3612.0,
      "completions/max_terminated_length": 3612.0,
      "completions/mean_length": 1004.86328125,
      "completions/mean_terminated_length": 1004.86328125,
      "completions/min_length": 334.0,
      "completions/min_terminated_length": 334.0,
      "epoch": 0.7172346402833761,
      "grad_norm": 0.20583825538387396,
      "learning_rate": 1e-06,
      "loss": 0.0243,
      "num_tokens": 181105850.0,
      "reward": 0.9111328125,
      "reward_std": 0.2520977854728699,
      "rewards/accuracy_reward/mean": 0.412109375,
      "rewards/accuracy_reward/std": 0.49269601702690125,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 367
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4012.0,
      "completions/max_terminated_length": 4012.0,
      "completions/mean_length": 1023.056640625,
      "completions/mean_terminated_length": 1023.056640625,
      "completions/min_length": 392.0,
      "completions/min_terminated_length": 392.0,
      "epoch": 0.7191889581043117,
      "grad_norm": 0.21930718293078055,
      "learning_rate": 1e-06,
      "loss": 0.0153,
      "num_tokens": 181687975.0,
      "reward": 0.935546875,
      "reward_std": 0.28961291909217834,
      "rewards/accuracy_reward/mean": 0.435546875,
      "rewards/accuracy_reward/std": 0.49631330370903015,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 368
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3805.0,
      "completions/max_terminated_length": 3805.0,
      "completions/mean_length": 975.69921875,
      "completions/mean_terminated_length": 975.69921875,
      "completions/min_length": 431.0,
      "completions/min_terminated_length": 431.0,
      "epoch": 0.7211432759252473,
      "grad_norm": 0.23388149896309182,
      "learning_rate": 1e-06,
      "loss": -0.0053,
      "num_tokens": 182271981.0,
      "reward": 1.205078125,
      "reward_std": 0.2989095449447632,
      "rewards/accuracy_reward/mean": 0.705078125,
      "rewards/accuracy_reward/std": 0.4564536213874817,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 369
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2938.0,
      "completions/max_terminated_length": 2938.0,
      "completions/mean_length": 865.365234375,
      "completions/mean_terminated_length": 865.365234375,
      "completions/min_length": 365.0,
      "completions/min_terminated_length": 365.0,
      "epoch": 0.723097593746183,
      "grad_norm": 0.25795839683205546,
      "learning_rate": 1e-06,
      "loss": 0.0125,
      "num_tokens": 182794040.0,
      "reward": 1.1806640625,
      "reward_std": 0.2649618089199066,
      "rewards/accuracy_reward/mean": 0.681640625,
      "rewards/accuracy_reward/std": 0.46629536151885986,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 370
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1626.0,
      "completions/max_terminated_length": 1626.0,
      "completions/mean_length": 874.736328125,
      "completions/mean_terminated_length": 874.736328125,
      "completions/min_length": 388.0,
      "completions/min_terminated_length": 388.0,
      "epoch": 0.7250519115671186,
      "grad_norm": 0.2619867502554444,
      "learning_rate": 1e-06,
      "loss": 0.0148,
      "num_tokens": 183338577.0,
      "reward": 1.16015625,
      "reward_std": 0.25967979431152344,
      "rewards/accuracy_reward/mean": 0.66015625,
      "rewards/accuracy_reward/std": 0.4741191864013672,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 371
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4121.0,
      "completions/max_terminated_length": 4121.0,
      "completions/mean_length": 1094.76953125,
      "completions/mean_terminated_length": 1094.76953125,
      "completions/min_length": 435.0,
      "completions/min_terminated_length": 435.0,
      "epoch": 0.7270062293880543,
      "grad_norm": 0.2576361395485926,
      "learning_rate": 1e-06,
      "loss": 0.0336,
      "num_tokens": 183970283.0,
      "reward": 1.0302734375,
      "reward_std": 0.3692060112953186,
      "rewards/accuracy_reward/mean": 0.53125,
      "rewards/accuracy_reward/std": 0.4995105266571045,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 372
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6671.0,
      "completions/max_terminated_length": 6671.0,
      "completions/mean_length": 1063.619140625,
      "completions/mean_terminated_length": 1063.619140625,
      "completions/min_length": 477.0,
      "completions/min_terminated_length": 477.0,
      "epoch": 0.7289605472089898,
      "grad_norm": 0.24802752750639284,
      "learning_rate": 1e-06,
      "loss": 0.0158,
      "num_tokens": 184582360.0,
      "reward": 1.0576171875,
      "reward_std": 0.32990533113479614,
      "rewards/accuracy_reward/mean": 0.55859375,
      "rewards/accuracy_reward/std": 0.4970405399799347,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 373
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4208.0,
      "completions/max_terminated_length": 4208.0,
      "completions/mean_length": 1165.00390625,
      "completions/mean_terminated_length": 1165.00390625,
      "completions/min_length": 450.0,
      "completions/min_terminated_length": 450.0,
      "epoch": 0.7309148650299255,
      "grad_norm": 0.2404454165210168,
      "learning_rate": 1e-06,
      "loss": 0.0074,
      "num_tokens": 185247162.0,
      "reward": 1.0302734375,
      "reward_std": 0.3166176676750183,
      "rewards/accuracy_reward/mean": 0.53125,
      "rewards/accuracy_reward/std": 0.4995105266571045,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 374
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4869.0,
      "completions/max_terminated_length": 4869.0,
      "completions/mean_length": 1084.7578125,
      "completions/mean_terminated_length": 1084.7578125,
      "completions/min_length": 378.0,
      "completions/min_terminated_length": 378.0,
      "epoch": 0.7328691828508611,
      "grad_norm": 0.24105245494477076,
      "learning_rate": 1e-06,
      "loss": 0.0162,
      "num_tokens": 185876350.0,
      "reward": 0.9599609375,
      "reward_std": 0.2650347948074341,
      "rewards/accuracy_reward/mean": 0.4609375,
      "rewards/accuracy_reward/std": 0.4989593029022217,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 375
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4852.0,
      "completions/max_terminated_length": 4852.0,
      "completions/mean_length": 1198.470703125,
      "completions/mean_terminated_length": 1198.470703125,
      "completions/min_length": 515.0,
      "completions/min_terminated_length": 515.0,
      "epoch": 0.7348235006717968,
      "grad_norm": 0.2459399628221602,
      "learning_rate": 1e-06,
      "loss": 0.0127,
      "num_tokens": 186561839.0,
      "reward": 0.86328125,
      "reward_std": 0.3282925486564636,
      "rewards/accuracy_reward/mean": 0.36328125,
      "rewards/accuracy_reward/std": 0.4814152419567108,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 376
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5050.0,
      "completions/max_terminated_length": 5050.0,
      "completions/mean_length": 1275.294921875,
      "completions/mean_terminated_length": 1275.294921875,
      "completions/min_length": 468.0,
      "completions/min_terminated_length": 468.0,
      "epoch": 0.7367778184927324,
      "grad_norm": 0.2176582509081832,
      "learning_rate": 1e-06,
      "loss": 0.0171,
      "num_tokens": 187293654.0,
      "reward": 0.908203125,
      "reward_std": 0.27721935510635376,
      "rewards/accuracy_reward/mean": 0.408203125,
      "rewards/accuracy_reward/std": 0.49198177456855774,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 377
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2405.0,
      "completions/max_terminated_length": 2405.0,
      "completions/mean_length": 1073.748046875,
      "completions/mean_terminated_length": 1073.748046875,
      "completions/min_length": 519.0,
      "completions/min_terminated_length": 519.0,
      "epoch": 0.738732136313668,
      "grad_norm": 0.2120312763275498,
      "learning_rate": 1e-06,
      "loss": 0.0079,
      "num_tokens": 187916085.0,
      "reward": 1.10546875,
      "reward_std": 0.26140832901000977,
      "rewards/accuracy_reward/mean": 0.60546875,
      "rewards/accuracy_reward/std": 0.4892277717590332,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 378
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5750.0,
      "completions/max_terminated_length": 5750.0,
      "completions/mean_length": 949.21875,
      "completions/mean_terminated_length": 949.21875,
      "completions/min_length": 383.0,
      "completions/min_terminated_length": 383.0,
      "epoch": 0.7406864541346037,
      "grad_norm": 0.27885106998279063,
      "learning_rate": 1e-06,
      "loss": 0.0067,
      "num_tokens": 188482933.0,
      "reward": 1.1767578125,
      "reward_std": 0.27184367179870605,
      "rewards/accuracy_reward/mean": 0.677734375,
      "rewards/accuracy_reward/std": 0.46780112385749817,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 379
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4050.0,
      "completions/max_terminated_length": 4050.0,
      "completions/mean_length": 1103.09765625,
      "completions/mean_terminated_length": 1103.09765625,
      "completions/min_length": 433.0,
      "completions/min_terminated_length": 433.0,
      "epoch": 0.7426407719555392,
      "grad_norm": 0.22653791246160732,
      "learning_rate": 1e-06,
      "loss": 0.0079,
      "num_tokens": 189148743.0,
      "reward": 0.986328125,
      "reward_std": 0.2644691467285156,
      "rewards/accuracy_reward/mean": 0.486328125,
      "rewards/accuracy_reward/std": 0.5003018379211426,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 380
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2086.0,
      "completions/max_terminated_length": 2086.0,
      "completions/mean_length": 950.05859375,
      "completions/mean_terminated_length": 950.05859375,
      "completions/min_length": 420.0,
      "completions/min_terminated_length": 420.0,
      "epoch": 0.7445950897764749,
      "grad_norm": 0.26884527182128715,
      "learning_rate": 1e-06,
      "loss": -0.0011,
      "num_tokens": 189723333.0,
      "reward": 0.966796875,
      "reward_std": 0.2909739017486572,
      "rewards/accuracy_reward/mean": 0.466796875,
      "rewards/accuracy_reward/std": 0.4993842542171478,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 381
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3257.0,
      "completions/max_terminated_length": 3257.0,
      "completions/mean_length": 1073.556640625,
      "completions/mean_terminated_length": 1073.556640625,
      "completions/min_length": 384.0,
      "completions/min_terminated_length": 384.0,
      "epoch": 0.7465494075974105,
      "grad_norm": 0.24243344727319555,
      "learning_rate": 1e-06,
      "loss": 0.0081,
      "num_tokens": 190352690.0,
      "reward": 1.126953125,
      "reward_std": 0.3259270489215851,
      "rewards/accuracy_reward/mean": 0.626953125,
      "rewards/accuracy_reward/std": 0.48408737778663635,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 382
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4828.0,
      "completions/max_terminated_length": 4828.0,
      "completions/mean_length": 973.80859375,
      "completions/mean_terminated_length": 973.80859375,
      "completions/min_length": 437.0,
      "completions/min_terminated_length": 437.0,
      "epoch": 0.7485037254183462,
      "grad_norm": 0.2818313890492895,
      "learning_rate": 1e-06,
      "loss": 0.0087,
      "num_tokens": 190934688.0,
      "reward": 1.1796875,
      "reward_std": 0.2989754378795624,
      "rewards/accuracy_reward/mean": 0.6796875,
      "rewards/accuracy_reward/std": 0.4670529365539551,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 383
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2797.0,
      "completions/max_terminated_length": 2797.0,
      "completions/mean_length": 922.3984375,
      "completions/mean_terminated_length": 922.3984375,
      "completions/min_length": 428.0,
      "completions/min_terminated_length": 428.0,
      "epoch": 0.7504580432392818,
      "grad_norm": 0.2664778743355963,
      "learning_rate": 1e-06,
      "loss": 0.013,
      "num_tokens": 191461164.0,
      "reward": 1.07421875,
      "reward_std": 0.3264962434768677,
      "rewards/accuracy_reward/mean": 0.57421875,
      "rewards/accuracy_reward/std": 0.4949444830417633,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 384
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4507.0,
      "completions/max_terminated_length": 4507.0,
      "completions/mean_length": 1047.314453125,
      "completions/mean_terminated_length": 1047.314453125,
      "completions/min_length": 220.0,
      "completions/min_terminated_length": 220.0,
      "epoch": 0.7524123610602175,
      "grad_norm": 0.22596544703523644,
      "learning_rate": 1e-06,
      "loss": 0.0193,
      "num_tokens": 192073085.0,
      "reward": 1.078125,
      "reward_std": 0.29244738817214966,
      "rewards/accuracy_reward/mean": 0.578125,
      "rewards/accuracy_reward/std": 0.49434176087379456,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 385
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4225.0,
      "completions/max_terminated_length": 4225.0,
      "completions/mean_length": 846.6328125,
      "completions/mean_terminated_length": 846.6328125,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "epoch": 0.754366678881153,
      "grad_norm": 0.2680276869113691,
      "learning_rate": 1e-06,
      "loss": 0.0095,
      "num_tokens": 192578433.0,
      "reward": 1.228515625,
      "reward_std": 0.2956398129463196,
      "rewards/accuracy_reward/mean": 0.728515625,
      "rewards/accuracy_reward/std": 0.44516023993492126,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 386
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2814.0,
      "completions/max_terminated_length": 2814.0,
      "completions/mean_length": 939.390625,
      "completions/mean_terminated_length": 939.390625,
      "completions/min_length": 403.0,
      "completions/min_terminated_length": 403.0,
      "epoch": 0.7563209967020886,
      "grad_norm": 0.25647382882660336,
      "learning_rate": 1e-06,
      "loss": -0.0063,
      "num_tokens": 193128841.0,
      "reward": 1.25,
      "reward_std": 0.26654744148254395,
      "rewards/accuracy_reward/mean": 0.75,
      "rewards/accuracy_reward/std": 0.43343618512153625,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 387
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2576.0,
      "completions/max_terminated_length": 2576.0,
      "completions/mean_length": 860.400390625,
      "completions/mean_terminated_length": 860.400390625,
      "completions/min_length": 346.0,
      "completions/min_terminated_length": 346.0,
      "epoch": 0.7582753145230243,
      "grad_norm": 0.3148442209674101,
      "learning_rate": 1e-06,
      "loss": 0.0093,
      "num_tokens": 193634582.0,
      "reward": 1.1591796875,
      "reward_std": 0.3296191096305847,
      "rewards/accuracy_reward/mean": 0.66015625,
      "rewards/accuracy_reward/std": 0.4741191864013672,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 388
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2325.0,
      "completions/max_terminated_length": 2325.0,
      "completions/mean_length": 948.8125,
      "completions/mean_terminated_length": 948.8125,
      "completions/min_length": 332.0,
      "completions/min_terminated_length": 332.0,
      "epoch": 0.7602296323439599,
      "grad_norm": 0.2574471346650531,
      "learning_rate": 1e-06,
      "loss": 0.0092,
      "num_tokens": 194198406.0,
      "reward": 1.169921875,
      "reward_std": 0.23618678748607635,
      "rewards/accuracy_reward/mean": 0.669921875,
      "rewards/accuracy_reward/std": 0.47070086002349854,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 389
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4683.0,
      "completions/max_terminated_length": 4683.0,
      "completions/mean_length": 1055.330078125,
      "completions/mean_terminated_length": 1055.330078125,
      "completions/min_length": 391.0,
      "completions/min_terminated_length": 391.0,
      "epoch": 0.7621839501648956,
      "grad_norm": 0.19428781730315361,
      "learning_rate": 1e-06,
      "loss": 0.0024,
      "num_tokens": 194795391.0,
      "reward": 1.1376953125,
      "reward_std": 0.21126717329025269,
      "rewards/accuracy_reward/mean": 0.638671875,
      "rewards/accuracy_reward/std": 0.48085519671440125,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 390
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4069.0,
      "completions/max_terminated_length": 4069.0,
      "completions/mean_length": 995.46484375,
      "completions/mean_terminated_length": 995.46484375,
      "completions/min_length": 365.0,
      "completions/min_terminated_length": 365.0,
      "epoch": 0.7641382679858312,
      "grad_norm": 0.2581499589268977,
      "learning_rate": 1e-06,
      "loss": 0.0187,
      "num_tokens": 195367341.0,
      "reward": 1.0654296875,
      "reward_std": 0.29932349920272827,
      "rewards/accuracy_reward/mean": 0.56640625,
      "rewards/accuracy_reward/std": 0.4960552453994751,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 391
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2046.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 862.248046875,
      "completions/mean_terminated_length": 862.248046875,
      "completions/min_length": 255.0,
      "completions/min_terminated_length": 255.0,
      "epoch": 0.7660925858067669,
      "grad_norm": 0.2818869309096585,
      "learning_rate": 1e-06,
      "loss": 0.0158,
      "num_tokens": 195868380.0,
      "reward": 1.19140625,
      "reward_std": 0.27888891100883484,
      "rewards/accuracy_reward/mean": 0.69140625,
      "rewards/accuracy_reward/std": 0.4623647928237915,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 392
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4574.0,
      "completions/max_terminated_length": 4574.0,
      "completions/mean_length": 916.607421875,
      "completions/mean_terminated_length": 916.607421875,
      "completions/min_length": 296.0,
      "completions/min_terminated_length": 296.0,
      "epoch": 0.7680469036277024,
      "grad_norm": 0.2642600120130633,
      "learning_rate": 1e-06,
      "loss": 0.0311,
      "num_tokens": 196392227.0,
      "reward": 1.1513671875,
      "reward_std": 0.2804816961288452,
      "rewards/accuracy_reward/mean": 0.65234375,
      "rewards/accuracy_reward/std": 0.47669193148612976,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 393
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 7538.0,
      "completions/max_terminated_length": 7538.0,
      "completions/mean_length": 1020.80859375,
      "completions/mean_terminated_length": 1022.8062744140625,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 401.0,
      "epoch": 0.7700012214486381,
      "grad_norm": 0.26642048952222164,
      "learning_rate": 1e-06,
      "loss": 0.0055,
      "num_tokens": 196981505.0,
      "reward": 1.0458984375,
      "reward_std": 0.2517250180244446,
      "rewards/accuracy_reward/mean": 0.546875,
      "rewards/accuracy_reward/std": 0.4982847273349762,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 394
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4545.0,
      "completions/max_terminated_length": 4545.0,
      "completions/mean_length": 1059.181640625,
      "completions/mean_terminated_length": 1059.181640625,
      "completions/min_length": 435.0,
      "completions/min_terminated_length": 435.0,
      "epoch": 0.7719555392695737,
      "grad_norm": 0.2531640916757571,
      "learning_rate": 1e-06,
      "loss": -0.0095,
      "num_tokens": 197587022.0,
      "reward": 0.939453125,
      "reward_std": 0.25433334708213806,
      "rewards/accuracy_reward/mean": 0.439453125,
      "rewards/accuracy_reward/std": 0.49680593609809875,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 395
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4299.0,
      "completions/max_terminated_length": 4299.0,
      "completions/mean_length": 1077.2265625,
      "completions/mean_terminated_length": 1077.2265625,
      "completions/min_length": 471.0,
      "completions/min_terminated_length": 471.0,
      "epoch": 0.7739098570905093,
      "grad_norm": 0.27930445296157,
      "learning_rate": 1e-06,
      "loss": 0.0133,
      "num_tokens": 198201346.0,
      "reward": 1.0126953125,
      "reward_std": 0.28195977210998535,
      "rewards/accuracy_reward/mean": 0.513671875,
      "rewards/accuracy_reward/std": 0.5003018379211426,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 396
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4401.0,
      "completions/max_terminated_length": 4401.0,
      "completions/mean_length": 1085.13671875,
      "completions/mean_terminated_length": 1085.13671875,
      "completions/min_length": 463.0,
      "completions/min_terminated_length": 463.0,
      "epoch": 0.775864174911445,
      "grad_norm": 0.25743249589936545,
      "learning_rate": 1e-06,
      "loss": 0.0151,
      "num_tokens": 198812424.0,
      "reward": 1.041015625,
      "reward_std": 0.326663613319397,
      "rewards/accuracy_reward/mean": 0.541015625,
      "rewards/accuracy_reward/std": 0.49880221486091614,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 397
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6222.0,
      "completions/max_terminated_length": 6222.0,
      "completions/mean_length": 1120.265625,
      "completions/mean_terminated_length": 1120.265625,
      "completions/min_length": 320.0,
      "completions/min_terminated_length": 320.0,
      "epoch": 0.7778184927323806,
      "grad_norm": 0.2503161354384062,
      "learning_rate": 1e-06,
      "loss": 0.0191,
      "num_tokens": 199453456.0,
      "reward": 0.9072265625,
      "reward_std": 0.29364052414894104,
      "rewards/accuracy_reward/mean": 0.408203125,
      "rewards/accuracy_reward/std": 0.49198177456855774,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 398
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3056.0,
      "completions/max_terminated_length": 3056.0,
      "completions/mean_length": 971.03515625,
      "completions/mean_terminated_length": 971.03515625,
      "completions/min_length": 353.0,
      "completions/min_terminated_length": 353.0,
      "epoch": 0.7797728105533163,
      "grad_norm": 0.27548667048132863,
      "learning_rate": 1e-06,
      "loss": 0.0064,
      "num_tokens": 200014434.0,
      "reward": 0.982421875,
      "reward_std": 0.18923774361610413,
      "rewards/accuracy_reward/mean": 0.482421875,
      "rewards/accuracy_reward/std": 0.5001795887947083,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 399
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3749.0,
      "completions/max_terminated_length": 3749.0,
      "completions/mean_length": 1073.80859375,
      "completions/mean_terminated_length": 1073.80859375,
      "completions/min_length": 396.0,
      "completions/min_terminated_length": 396.0,
      "epoch": 0.7817271283742518,
      "grad_norm": 0.3086954039100987,
      "learning_rate": 1e-06,
      "loss": 0.0105,
      "num_tokens": 200624272.0,
      "reward": 1.0283203125,
      "reward_std": 0.3033173680305481,
      "rewards/accuracy_reward/mean": 0.53125,
      "rewards/accuracy_reward/std": 0.4995105266571045,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.994140625,
      "rewards/soft_format_reward/std": 0.07639661431312561,
      "step": 400
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4564.0,
      "completions/max_terminated_length": 4564.0,
      "completions/mean_length": 1044.466796875,
      "completions/mean_terminated_length": 1044.466796875,
      "completions/min_length": 459.0,
      "completions/min_terminated_length": 459.0,
      "epoch": 0.7836814461951875,
      "grad_norm": 0.2684355677148113,
      "learning_rate": 1e-06,
      "loss": 0.0036,
      "num_tokens": 201222863.0,
      "reward": 0.962890625,
      "reward_std": 0.28903210163116455,
      "rewards/accuracy_reward/mean": 0.462890625,
      "rewards/accuracy_reward/std": 0.4991086423397064,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 401
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2505.0,
      "completions/max_terminated_length": 2505.0,
      "completions/mean_length": 942.828125,
      "completions/mean_terminated_length": 942.828125,
      "completions/min_length": 375.0,
      "completions/min_terminated_length": 375.0,
      "epoch": 0.7856357640161231,
      "grad_norm": 0.28069683172528026,
      "learning_rate": 1e-06,
      "loss": 0.0048,
      "num_tokens": 201762471.0,
      "reward": 1.109375,
      "reward_std": 0.2700616121292114,
      "rewards/accuracy_reward/mean": 0.609375,
      "rewards/accuracy_reward/std": 0.48836761713027954,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 402
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5010.0,
      "completions/max_terminated_length": 5010.0,
      "completions/mean_length": 728.0390625,
      "completions/mean_terminated_length": 728.0390625,
      "completions/min_length": 314.0,
      "completions/min_terminated_length": 314.0,
      "epoch": 0.7875900818370587,
      "grad_norm": 0.3505792077876118,
      "learning_rate": 1e-06,
      "loss": 0.0132,
      "num_tokens": 202198155.0,
      "reward": 1.205078125,
      "reward_std": 0.273173987865448,
      "rewards/accuracy_reward/mean": 0.705078125,
      "rewards/accuracy_reward/std": 0.4564536213874817,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 403
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3155.0,
      "completions/max_terminated_length": 3155.0,
      "completions/mean_length": 831.5546875,
      "completions/mean_terminated_length": 831.5546875,
      "completions/min_length": 334.0,
      "completions/min_terminated_length": 334.0,
      "epoch": 0.7895443996579944,
      "grad_norm": 0.35387225051476173,
      "learning_rate": 1e-06,
      "loss": 0.0153,
      "num_tokens": 202693159.0,
      "reward": 1.208984375,
      "reward_std": 0.26959553360939026,
      "rewards/accuracy_reward/mean": 0.708984375,
      "rewards/accuracy_reward/std": 0.45467492938041687,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 404
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 7718.0,
      "completions/max_terminated_length": 7718.0,
      "completions/mean_length": 822.166015625,
      "completions/mean_terminated_length": 823.7749633789062,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 367.0,
      "epoch": 0.79149871747893,
      "grad_norm": 0.3237887263193711,
      "learning_rate": 1e-06,
      "loss": -0.0075,
      "num_tokens": 203175356.0,
      "reward": 1.2001953125,
      "reward_std": 0.24427789449691772,
      "rewards/accuracy_reward/mean": 0.701171875,
      "rewards/accuracy_reward/std": 0.45819199085235596,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 405
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5268.0,
      "completions/max_terminated_length": 5268.0,
      "completions/mean_length": 857.345703125,
      "completions/mean_terminated_length": 857.345703125,
      "completions/min_length": 257.0,
      "completions/min_terminated_length": 257.0,
      "epoch": 0.7934530352998657,
      "grad_norm": 0.29168011380998693,
      "learning_rate": 1e-06,
      "loss": 0.0075,
      "num_tokens": 203676685.0,
      "reward": 1.28125,
      "reward_std": 0.23003166913986206,
      "rewards/accuracy_reward/mean": 0.78125,
      "rewards/accuracy_reward/std": 0.41380295157432556,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 406
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3649.0,
      "completions/max_terminated_length": 3649.0,
      "completions/mean_length": 811.177734375,
      "completions/mean_terminated_length": 811.177734375,
      "completions/min_length": 277.0,
      "completions/min_terminated_length": 277.0,
      "epoch": 0.7954073531208012,
      "grad_norm": 0.376105611547192,
      "learning_rate": 1e-06,
      "loss": 0.0002,
      "num_tokens": 204168392.0,
      "reward": 1.134765625,
      "reward_std": 0.26618099212646484,
      "rewards/accuracy_reward/mean": 0.634765625,
      "rewards/accuracy_reward/std": 0.4819667339324951,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 407
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5714.0,
      "completions/max_terminated_length": 5714.0,
      "completions/mean_length": 869.43359375,
      "completions/mean_terminated_length": 869.43359375,
      "completions/min_length": 339.0,
      "completions/min_terminated_length": 339.0,
      "epoch": 0.7973616709417369,
      "grad_norm": 0.28847549854200055,
      "learning_rate": 1e-06,
      "loss": 0.0077,
      "num_tokens": 204673574.0,
      "reward": 1.2021484375,
      "reward_std": 0.2509317994117737,
      "rewards/accuracy_reward/mean": 0.703125,
      "rewards/accuracy_reward/std": 0.45732781291007996,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.998046875,
      "rewards/soft_format_reward/std": 0.04419417306780815,
      "step": 408
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5213.0,
      "completions/max_terminated_length": 5213.0,
      "completions/mean_length": 868.015625,
      "completions/mean_terminated_length": 868.015625,
      "completions/min_length": 187.0,
      "completions/min_terminated_length": 187.0,
      "epoch": 0.7993159887626725,
      "grad_norm": 0.319636585629475,
      "learning_rate": 1e-06,
      "loss": -0.0028,
      "num_tokens": 205178894.0,
      "reward": 1.171875,
      "reward_std": 0.29458165168762207,
      "rewards/accuracy_reward/mean": 0.671875,
      "rewards/accuracy_reward/std": 0.4699897766113281,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 409
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2088.0,
      "completions/max_terminated_length": 2088.0,
      "completions/mean_length": 811.818359375,
      "completions/mean_terminated_length": 811.818359375,
      "completions/min_length": 264.0,
      "completions/min_terminated_length": 264.0,
      "epoch": 0.8012703065836082,
      "grad_norm": 0.35531193872391,
      "learning_rate": 1e-06,
      "loss": 0.0089,
      "num_tokens": 205650817.0,
      "reward": 1.212890625,
      "reward_std": 0.2410779893398285,
      "rewards/accuracy_reward/mean": 0.712890625,
      "rewards/accuracy_reward/std": 0.45285552740097046,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 410
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5580.0,
      "completions/max_terminated_length": 5580.0,
      "completions/mean_length": 783.1484375,
      "completions/mean_terminated_length": 783.1484375,
      "completions/min_length": 308.0,
      "completions/min_terminated_length": 308.0,
      "epoch": 0.8032246244045438,
      "grad_norm": 0.3643183188974139,
      "learning_rate": 1e-06,
      "loss": 0.0153,
      "num_tokens": 206109965.0,
      "reward": 1.328125,
      "reward_std": 0.18987837433815002,
      "rewards/accuracy_reward/mean": 0.828125,
      "rewards/accuracy_reward/std": 0.3776407241821289,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 411
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4124.0,
      "completions/max_terminated_length": 4124.0,
      "completions/mean_length": 795.90234375,
      "completions/mean_terminated_length": 795.90234375,
      "completions/min_length": 244.0,
      "completions/min_terminated_length": 244.0,
      "epoch": 0.8051789422254794,
      "grad_norm": 0.3573674837969747,
      "learning_rate": 1e-06,
      "loss": 0.0118,
      "num_tokens": 206575931.0,
      "reward": 1.220703125,
      "reward_std": 0.2888934016227722,
      "rewards/accuracy_reward/mean": 0.720703125,
      "rewards/accuracy_reward/std": 0.44909247756004333,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 412
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2433.0,
      "completions/max_terminated_length": 2433.0,
      "completions/mean_length": 766.810546875,
      "completions/mean_terminated_length": 766.810546875,
      "completions/min_length": 206.0,
      "completions/min_terminated_length": 206.0,
      "epoch": 0.8071332600464151,
      "grad_norm": 0.3521201101619864,
      "learning_rate": 1e-06,
      "loss": -0.0048,
      "num_tokens": 207032570.0,
      "reward": 1.234375,
      "reward_std": 0.2204812467098236,
      "rewards/accuracy_reward/mean": 0.734375,
      "rewards/accuracy_reward/std": 0.44209739565849304,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 413
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 7929.0,
      "completions/max_terminated_length": 7929.0,
      "completions/mean_length": 909.228515625,
      "completions/mean_terminated_length": 909.228515625,
      "completions/min_length": 219.0,
      "completions/min_terminated_length": 219.0,
      "epoch": 0.8090875778673506,
      "grad_norm": 0.35595265396748055,
      "learning_rate": 1e-06,
      "loss": 0.0166,
      "num_tokens": 207576111.0,
      "reward": 1.12109375,
      "reward_std": 0.2677309215068817,
      "rewards/accuracy_reward/mean": 0.62109375,
      "rewards/accuracy_reward/std": 0.4855891764163971,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 414
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5867.0,
      "completions/max_terminated_length": 5867.0,
      "completions/mean_length": 842.884765625,
      "completions/mean_terminated_length": 842.884765625,
      "completions/min_length": 328.0,
      "completions/min_terminated_length": 328.0,
      "epoch": 0.8110418956882863,
      "grad_norm": 0.2703294250943154,
      "learning_rate": 1e-06,
      "loss": 0.0035,
      "num_tokens": 208069988.0,
      "reward": 1.21484375,
      "reward_std": 0.22637133300304413,
      "rewards/accuracy_reward/mean": 0.71484375,
      "rewards/accuracy_reward/std": 0.45193037390708923,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 415
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0078125,
      "completions/max_length": 5990.0,
      "completions/max_terminated_length": 5990.0,
      "completions/mean_length": 843.8984375,
      "completions/mean_terminated_length": 850.5432739257812,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 332.0,
      "epoch": 0.8129962135092219,
      "grad_norm": 0.3370587897049859,
      "learning_rate": 1e-06,
      "loss": -0.0377,
      "num_tokens": 208567328.0,
      "reward": 1.060546875,
      "reward_std": 0.26288777589797974,
      "rewards/accuracy_reward/mean": 0.564453125,
      "rewards/accuracy_reward/std": 0.49631330370903015,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.9921875,
      "rewards/soft_format_reward/std": 0.08812850713729858,
      "step": 416
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.005859375,
      "completions/max_length": 6346.0,
      "completions/max_terminated_length": 6346.0,
      "completions/mean_length": 783.865234375,
      "completions/mean_terminated_length": 788.4852905273438,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 317.0,
      "epoch": 0.8149505313301576,
      "grad_norm": 0.39665042850965304,
      "learning_rate": 1e-06,
      "loss": 0.0034,
      "num_tokens": 209044107.0,
      "reward": 1.201171875,
      "reward_std": 0.29087740182876587,
      "rewards/accuracy_reward/mean": 0.705078125,
      "rewards/accuracy_reward/std": 0.4564536213874817,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.9921875,
      "rewards/soft_format_reward/std": 0.08812850713729858,
      "step": 417
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00390625,
      "completions/max_length": 2263.0,
      "completions/max_terminated_length": 2263.0,
      "completions/mean_length": 724.42578125,
      "completions/mean_terminated_length": 727.2667236328125,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 292.0,
      "epoch": 0.8169048491510932,
      "grad_norm": 0.42912039696135573,
      "learning_rate": 1e-06,
      "loss": -0.0041,
      "num_tokens": 209476085.0,
      "reward": 1.2353515625,
      "reward_std": 0.31185171008110046,
      "rewards/accuracy_reward/mean": 0.73828125,
      "rewards/accuracy_reward/std": 0.44000017642974854,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.994140625,
      "rewards/soft_format_reward/std": 0.07639661431312561,
      "step": 418
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 2207.0,
      "completions/max_terminated_length": 2207.0,
      "completions/mean_length": 783.4921875,
      "completions/mean_terminated_length": 785.0254516601562,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 277.0,
      "epoch": 0.8188591669720289,
      "grad_norm": 0.3416888766786459,
      "learning_rate": 1e-06,
      "loss": 0.0028,
      "num_tokens": 209933121.0,
      "reward": 1.1796875,
      "reward_std": 0.24416999518871307,
      "rewards/accuracy_reward/mean": 0.681640625,
      "rewards/accuracy_reward/std": 0.46629536151885986,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 419
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.01171875,
      "completions/max_length": 7035.0,
      "completions/max_terminated_length": 7035.0,
      "completions/mean_length": 893.9609375,
      "completions/mean_terminated_length": 904.561279296875,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 349.0,
      "epoch": 0.8208134847929645,
      "grad_norm": 0.33763547934277416,
      "learning_rate": 1e-06,
      "loss": -0.0056,
      "num_tokens": 210451181.0,
      "reward": 1.1533203125,
      "reward_std": 0.28630441427230835,
      "rewards/accuracy_reward/mean": 0.66015625,
      "rewards/accuracy_reward/std": 0.4741191864013672,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.986328125,
      "rewards/soft_format_reward/std": 0.1162383034825325,
      "step": 420
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0078125,
      "completions/max_length": 7300.0,
      "completions/max_terminated_length": 7300.0,
      "completions/mean_length": 834.74609375,
      "completions/mean_terminated_length": 841.3189086914062,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 302.0,
      "epoch": 0.8227678026139,
      "grad_norm": 0.33386172323003377,
      "learning_rate": 1e-06,
      "loss": 0.0102,
      "num_tokens": 210963211.0,
      "reward": 1.1533203125,
      "reward_std": 0.2574579417705536,
      "rewards/accuracy_reward/mean": 0.66015625,
      "rewards/accuracy_reward/std": 0.4741191864013672,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.986328125,
      "rewards/soft_format_reward/std": 0.1162383034825325,
      "step": 421
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.005859375,
      "completions/max_length": 7065.0,
      "completions/max_terminated_length": 7065.0,
      "completions/mean_length": 814.94921875,
      "completions/mean_terminated_length": 819.7525024414062,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 306.0,
      "epoch": 0.8247221204348357,
      "grad_norm": 0.3652594693962747,
      "learning_rate": 1e-06,
      "loss": 0.0058,
      "num_tokens": 211454097.0,
      "reward": 1.1005859375,
      "reward_std": 0.2645086646080017,
      "rewards/accuracy_reward/mean": 0.60546875,
      "rewards/accuracy_reward/std": 0.4892277717590332,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.990234375,
      "rewards/soft_format_reward/std": 0.09843364357948303,
      "step": 422
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 7479.0,
      "completions/max_terminated_length": 7479.0,
      "completions/mean_length": 795.013671875,
      "completions/mean_terminated_length": 796.5694580078125,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 342.0,
      "epoch": 0.8266764382557713,
      "grad_norm": 0.3293846810079901,
      "learning_rate": 1e-06,
      "loss": 0.0145,
      "num_tokens": 211930280.0,
      "reward": 1.1337890625,
      "reward_std": 0.26773542165756226,
      "rewards/accuracy_reward/mean": 0.63671875,
      "rewards/accuracy_reward/std": 0.4814152419567108,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.994140625,
      "rewards/soft_format_reward/std": 0.07639661431312561,
      "step": 423
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.009765625,
      "completions/max_length": 4412.0,
      "completions/max_terminated_length": 4412.0,
      "completions/mean_length": 874.89453125,
      "completions/mean_terminated_length": 883.522705078125,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 347.0,
      "epoch": 0.828630756076707,
      "grad_norm": 0.3535089662836341,
      "learning_rate": 1e-06,
      "loss": -0.0016,
      "num_tokens": 212440290.0,
      "reward": 1.13671875,
      "reward_std": 0.29048609733581543,
      "rewards/accuracy_reward/mean": 0.64453125,
      "rewards/accuracy_reward/std": 0.47912323474884033,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.984375,
      "rewards/soft_format_reward/std": 0.12414088100194931,
      "step": 424
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00390625,
      "completions/max_length": 6280.0,
      "completions/max_terminated_length": 6280.0,
      "completions/mean_length": 893.9921875,
      "completions/mean_terminated_length": 897.4981079101562,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 390.0,
      "epoch": 0.8305850738976426,
      "grad_norm": 0.3185673827005288,
      "learning_rate": 1e-06,
      "loss": 0.0033,
      "num_tokens": 212974542.0,
      "reward": 1.125,
      "reward_std": 0.28310471773147583,
      "rewards/accuracy_reward/mean": 0.626953125,
      "rewards/accuracy_reward/std": 0.48408737778663635,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 425
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00390625,
      "completions/max_length": 4507.0,
      "completions/max_terminated_length": 4507.0,
      "completions/mean_length": 900.79296875,
      "completions/mean_terminated_length": 904.3255615234375,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 312.0,
      "epoch": 0.8325393917185783,
      "grad_norm": 0.3267191408879667,
      "learning_rate": 1e-06,
      "loss": 0.0007,
      "num_tokens": 213512532.0,
      "reward": 1.0380859375,
      "reward_std": 0.26320767402648926,
      "rewards/accuracy_reward/mean": 0.541015625,
      "rewards/accuracy_reward/std": 0.49880221486091614,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.994140625,
      "rewards/soft_format_reward/std": 0.07639661431312561,
      "step": 426
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 3350.0,
      "completions/max_terminated_length": 3350.0,
      "completions/mean_length": 849.65625,
      "completions/mean_terminated_length": 851.3189697265625,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 332.0,
      "epoch": 0.8344937095395139,
      "grad_norm": 0.3430058021350143,
      "learning_rate": 1e-06,
      "loss": 0.0141,
      "num_tokens": 214019156.0,
      "reward": 1.13671875,
      "reward_std": 0.2617988884449005,
      "rewards/accuracy_reward/mean": 0.640625,
      "rewards/accuracy_reward/std": 0.48028653860092163,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.9921875,
      "rewards/soft_format_reward/std": 0.08812850713729858,
      "step": 427
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00390625,
      "completions/max_length": 8024.0,
      "completions/max_terminated_length": 8024.0,
      "completions/mean_length": 908.203125,
      "completions/mean_terminated_length": 911.7647705078125,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 310.0,
      "epoch": 0.8364480273604495,
      "grad_norm": 0.3490585175056411,
      "learning_rate": 1e-06,
      "loss": 0.008,
      "num_tokens": 214545612.0,
      "reward": 1.1923828125,
      "reward_std": 0.30423495173454285,
      "rewards/accuracy_reward/mean": 0.6953125,
      "rewards/accuracy_reward/std": 0.4607250988483429,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.994140625,
      "rewards/soft_format_reward/std": 0.07639661431312561,
      "step": 428
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 2856.0,
      "completions/max_terminated_length": 2856.0,
      "completions/mean_length": 773.9296875,
      "completions/mean_terminated_length": 775.4442138671875,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 302.0,
      "epoch": 0.8384023451813851,
      "grad_norm": 0.3545128571512148,
      "learning_rate": 1e-06,
      "loss": -0.0035,
      "num_tokens": 215040664.0,
      "reward": 1.0986328125,
      "reward_std": 0.29531624913215637,
      "rewards/accuracy_reward/mean": 0.6015625,
      "rewards/accuracy_reward/std": 0.4900552034378052,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.994140625,
      "rewards/soft_format_reward/std": 0.07639661431312561,
      "step": 429
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00390625,
      "completions/max_length": 5426.0,
      "completions/max_terminated_length": 5426.0,
      "completions/mean_length": 816.771484375,
      "completions/mean_terminated_length": 819.9745483398438,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 319.0,
      "epoch": 0.8403566630023207,
      "grad_norm": 0.2718305462169434,
      "learning_rate": 1e-06,
      "loss": 0.0078,
      "num_tokens": 215523379.0,
      "reward": 1.166015625,
      "reward_std": 0.22507482767105103,
      "rewards/accuracy_reward/mean": 0.669921875,
      "rewards/accuracy_reward/std": 0.47070086002349854,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.9921875,
      "rewards/soft_format_reward/std": 0.08812850713729858,
      "step": 430
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0078125,
      "completions/max_length": 1784.0,
      "completions/max_terminated_length": 1784.0,
      "completions/mean_length": 841.55078125,
      "completions/mean_terminated_length": 848.1771850585938,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 233.0,
      "epoch": 0.8423109808232564,
      "grad_norm": 0.2822841226319474,
      "learning_rate": 1e-06,
      "loss": -0.0183,
      "num_tokens": 216030333.0,
      "reward": 1.197265625,
      "reward_std": 0.25299516320228577,
      "rewards/accuracy_reward/mean": 0.701171875,
      "rewards/accuracy_reward/std": 0.45819199085235596,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.9921875,
      "rewards/soft_format_reward/std": 0.08812850713729858,
      "step": 431
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3192.0,
      "completions/max_terminated_length": 3192.0,
      "completions/mean_length": 806.685546875,
      "completions/mean_terminated_length": 806.685546875,
      "completions/min_length": 290.0,
      "completions/min_terminated_length": 290.0,
      "epoch": 0.844265298644192,
      "grad_norm": 0.29442535204726733,
      "learning_rate": 1e-06,
      "loss": -0.0061,
      "num_tokens": 216511276.0,
      "reward": 1.11328125,
      "reward_std": 0.2568894326686859,
      "rewards/accuracy_reward/mean": 0.61328125,
      "rewards/accuracy_reward/std": 0.48747459053993225,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 1.0,
      "rewards/soft_format_reward/std": 0.0,
      "step": 432
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 7614.0,
      "completions/max_terminated_length": 7614.0,
      "completions/mean_length": 915.326171875,
      "completions/mean_terminated_length": 917.117431640625,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 273.0,
      "epoch": 0.8462196164651277,
      "grad_norm": 0.3246937572208824,
      "learning_rate": 1e-06,
      "loss": -0.0129,
      "num_tokens": 217038419.0,
      "reward": 1.072265625,
      "reward_std": 0.29967623949050903,
      "rewards/accuracy_reward/mean": 0.576171875,
      "rewards/accuracy_reward/std": 0.4946470856666565,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.9921875,
      "rewards/soft_format_reward/std": 0.08812850713729858,
      "step": 433
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4563.0,
      "completions/max_terminated_length": 4563.0,
      "completions/mean_length": 879.8125,
      "completions/mean_terminated_length": 879.8125,
      "completions/min_length": 373.0,
      "completions/min_terminated_length": 373.0,
      "epoch": 0.8481739342860632,
      "grad_norm": 0.297249871635069,
      "learning_rate": 1e-06,
      "loss": 0.0092,
      "num_tokens": 217553843.0,
      "reward": 1.251953125,
      "reward_std": 0.2540125250816345,
      "rewards/accuracy_reward/mean": 0.75390625,
      "rewards/accuracy_reward/std": 0.4311550557613373,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 434
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.01171875,
      "completions/max_length": 5545.0,
      "completions/max_terminated_length": 5545.0,
      "completions/mean_length": 939.572265625,
      "completions/mean_terminated_length": 950.7135009765625,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 352.0,
      "epoch": 0.8501282521069989,
      "grad_norm": 0.3374361113595728,
      "learning_rate": 1e-06,
      "loss": -0.0255,
      "num_tokens": 218098152.0,
      "reward": 1.064453125,
      "reward_std": 0.34374499320983887,
      "rewards/accuracy_reward/mean": 0.57421875,
      "rewards/accuracy_reward/std": 0.4949444830417633,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.98046875,
      "rewards/soft_format_reward/std": 0.1385180652141571,
      "step": 435
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 6207.0,
      "completions/max_terminated_length": 6207.0,
      "completions/mean_length": 837.673828125,
      "completions/mean_terminated_length": 839.3131103515625,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 293.0,
      "epoch": 0.8520825699279345,
      "grad_norm": 0.32954076563755397,
      "learning_rate": 1e-06,
      "loss": 0.0125,
      "num_tokens": 218603585.0,
      "reward": 1.09375,
      "reward_std": 0.24614398181438446,
      "rewards/accuracy_reward/mean": 0.6015625,
      "rewards/accuracy_reward/std": 0.4900552034378052,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.984375,
      "rewards/soft_format_reward/std": 0.12414088100194931,
      "step": 436
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 1599.0,
      "completions/max_terminated_length": 1599.0,
      "completions/mean_length": 749.869140625,
      "completions/mean_terminated_length": 751.3366088867188,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 269.0,
      "epoch": 0.8540368877488702,
      "grad_norm": 0.299203277143821,
      "learning_rate": 1e-06,
      "loss": -0.0143,
      "num_tokens": 219059486.0,
      "reward": 1.126953125,
      "reward_std": 0.23192954063415527,
      "rewards/accuracy_reward/mean": 0.62890625,
      "rewards/accuracy_reward/std": 0.4835699498653412,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.99609375,
      "rewards/soft_format_reward/std": 0.06243881583213806,
      "step": 437
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00390625,
      "completions/max_length": 5557.0,
      "completions/max_terminated_length": 5557.0,
      "completions/mean_length": 937.2109375,
      "completions/mean_terminated_length": 940.8863525390625,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 228.0,
      "epoch": 0.8559912055698058,
      "grad_norm": 0.302322572675226,
      "learning_rate": 1e-06,
      "loss": 0.0055,
      "num_tokens": 219601418.0,
      "reward": 1.2060546875,
      "reward_std": 0.2617872357368469,
      "rewards/accuracy_reward/mean": 0.7109375,
      "rewards/accuracy_reward/std": 0.45377036929130554,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.990234375,
      "rewards/soft_format_reward/std": 0.09843364357948303,
      "step": 438
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.021484375,
      "completions/max_length": 6134.0,
      "completions/max_terminated_length": 6134.0,
      "completions/mean_length": 976.826171875,
      "completions/mean_terminated_length": 998.2734375,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 349.0,
      "epoch": 0.8579455233907414,
      "grad_norm": 0.3719115885092292,
      "learning_rate": 1e-06,
      "loss": -0.0254,
      "num_tokens": 220170401.0,
      "reward": 0.9501953125,
      "reward_std": 0.37683263421058655,
      "rewards/accuracy_reward/mean": 0.46875,
      "rewards/accuracy_reward/std": 0.4995105266571045,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.962890625,
      "rewards/soft_format_reward/std": 0.18921469151973724,
      "step": 439
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00390625,
      "completions/max_length": 7540.0,
      "completions/max_terminated_length": 7540.0,
      "completions/mean_length": 1117.2578125,
      "completions/mean_terminated_length": 1121.6392822265625,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 560.0,
      "epoch": 0.8598998412116771,
      "grad_norm": 0.38724736233093465,
      "learning_rate": 1e-06,
      "loss": -0.0202,
      "num_tokens": 220813669.0,
      "reward": 0.8701171875,
      "reward_std": 0.3525937497615814,
      "rewards/accuracy_reward/mean": 0.384765625,
      "rewards/accuracy_reward/std": 0.4870156943798065,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.970703125,
      "rewards/soft_format_reward/std": 0.16880230605602264,
      "step": 440
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00390625,
      "completions/max_length": 2727.0,
      "completions/max_terminated_length": 2727.0,
      "completions/mean_length": 1055.904296875,
      "completions/mean_terminated_length": 1060.045166015625,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 523.0,
      "epoch": 0.8618541590326126,
      "grad_norm": 0.5770145242402152,
      "learning_rate": 1e-06,
      "loss": -0.0132,
      "num_tokens": 221425540.0,
      "reward": 0.9208984375,
      "reward_std": 0.33073461055755615,
      "rewards/accuracy_reward/mean": 0.443359375,
      "rewards/accuracy_reward/std": 0.49726733565330505,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.955078125,
      "rewards/soft_format_reward/std": 0.20733514428138733,
      "step": 441
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0078125,
      "completions/max_length": 3204.0,
      "completions/max_terminated_length": 3204.0,
      "completions/mean_length": 1063.28125,
      "completions/mean_terminated_length": 1071.653564453125,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 286.0,
      "epoch": 0.8638084768535483,
      "grad_norm": 0.632409719274583,
      "learning_rate": 1e-06,
      "loss": -0.0203,
      "num_tokens": 222045060.0,
      "reward": 0.8525390625,
      "reward_std": 0.35422688722610474,
      "rewards/accuracy_reward/mean": 0.392578125,
      "rewards/accuracy_reward/std": 0.4888018071651459,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.919921875,
      "rewards/soft_format_reward/std": 0.271679550409317,
      "step": 442
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013671875,
      "completions/max_length": 6647.0,
      "completions/max_terminated_length": 6647.0,
      "completions/mean_length": 953.896484375,
      "completions/mean_terminated_length": 967.1188354492188,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 402.0,
      "epoch": 0.8657627946744839,
      "grad_norm": 0.5184286136801499,
      "learning_rate": 1e-06,
      "loss": -0.0515,
      "num_tokens": 222602047.0,
      "reward": 0.8583984375,
      "reward_std": 0.3725011944770813,
      "rewards/accuracy_reward/mean": 0.416015625,
      "rewards/accuracy_reward/std": 0.493378221988678,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.884765625,
      "rewards/soft_format_reward/std": 0.3196168541908264,
      "step": 443
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.029296875,
      "completions/max_length": 5605.0,
      "completions/max_terminated_length": 5605.0,
      "completions/mean_length": 910.240234375,
      "completions/mean_terminated_length": 937.7122192382812,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 153.0,
      "epoch": 0.8677171124954196,
      "grad_norm": 1.1610365297434613,
      "learning_rate": 1e-06,
      "loss": -0.076,
      "num_tokens": 223138042.0,
      "reward": 0.755859375,
      "reward_std": 0.4233691096305847,
      "rewards/accuracy_reward/mean": 0.328125,
      "rewards/accuracy_reward/std": 0.4699897766113281,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.85546875,
      "rewards/soft_format_reward/std": 0.35197147727012634,
      "step": 444
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.05078125,
      "completions/max_length": 7396.0,
      "completions/max_terminated_length": 7396.0,
      "completions/mean_length": 834.078125,
      "completions/mean_terminated_length": 878.6995849609375,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 199.0,
      "epoch": 0.8696714303163552,
      "grad_norm": 1.228348619301139,
      "learning_rate": 1e-06,
      "loss": -0.1206,
      "num_tokens": 223628466.0,
      "reward": 0.6611328125,
      "reward_std": 0.4097694754600525,
      "rewards/accuracy_reward/mean": 0.271484375,
      "rewards/accuracy_reward/std": 0.44516023993492126,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.779296875,
      "rewards/soft_format_reward/std": 0.4151262938976288,
      "step": 445
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.05078125,
      "completions/max_length": 5346.0,
      "completions/max_terminated_length": 5346.0,
      "completions/mean_length": 820.140625,
      "completions/mean_terminated_length": 864.0164184570312,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 295.0,
      "epoch": 0.8716257481372909,
      "grad_norm": 0.9524625747029093,
      "learning_rate": 1e-06,
      "loss": -0.1072,
      "num_tokens": 224117290.0,
      "reward": 0.7109375,
      "reward_std": 0.49370861053466797,
      "rewards/accuracy_reward/mean": 0.3203125,
      "rewards/accuracy_reward/std": 0.4670529365539551,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.78125,
      "rewards/soft_format_reward/std": 0.41380295157432556,
      "step": 446
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.072265625,
      "completions/max_length": 6239.0,
      "completions/max_terminated_length": 6239.0,
      "completions/mean_length": 811.419921875,
      "completions/mean_terminated_length": 874.625244140625,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 222.0,
      "epoch": 0.8735800659582265,
      "grad_norm": 1.3311003300904964,
      "learning_rate": 1e-06,
      "loss": -0.105,
      "num_tokens": 224603297.0,
      "reward": 0.5322265625,
      "reward_std": 0.42388588190078735,
      "rewards/accuracy_reward/mean": 0.185546875,
      "rewards/accuracy_reward/std": 0.38912075757980347,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.693359375,
      "rewards/soft_format_reward/std": 0.4615498185157776,
      "step": 447
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.05078125,
      "completions/max_length": 7076.0,
      "completions/max_terminated_length": 7076.0,
      "completions/mean_length": 797.146484375,
      "completions/mean_terminated_length": 839.7921752929688,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 196.0,
      "epoch": 0.875534383779162,
      "grad_norm": 1.7947968239974508,
      "learning_rate": 1e-06,
      "loss": -0.1187,
      "num_tokens": 225086988.0,
      "reward": 0.58203125,
      "reward_std": 0.4251861572265625,
      "rewards/accuracy_reward/mean": 0.22265625,
      "rewards/accuracy_reward/std": 0.41643625497817993,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.71875,
      "rewards/soft_format_reward/std": 0.45004892349243164,
      "step": 448
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.083984375,
      "completions/max_length": 7496.0,
      "completions/max_terminated_length": 7496.0,
      "completions/mean_length": 729.158203125,
      "completions/mean_terminated_length": 796.0106811523438,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 190.0,
      "epoch": 0.8774887016000977,
      "grad_norm": 1.7079583135439507,
      "learning_rate": 1e-06,
      "loss": -0.1622,
      "num_tokens": 225540973.0,
      "reward": 0.66796875,
      "reward_std": 0.4337148666381836,
      "rewards/accuracy_reward/mean": 0.294921875,
      "rewards/accuracy_reward/std": 0.4564536213874817,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.74609375,
      "rewards/soft_format_reward/std": 0.43567025661468506,
      "step": 449
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.087890625,
      "completions/max_length": 6698.0,
      "completions/max_terminated_length": 6698.0,
      "completions/mean_length": 728.2890625,
      "completions/mean_terminated_length": 798.466796875,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 232.0,
      "epoch": 0.8794430194210333,
      "grad_norm": 1.5059900902257461,
      "learning_rate": 1e-06,
      "loss": -0.1841,
      "num_tokens": 225989025.0,
      "reward": 0.70703125,
      "reward_std": 0.3947368264198303,
      "rewards/accuracy_reward/mean": 0.3046875,
      "rewards/accuracy_reward/std": 0.4607250988483429,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.8046875,
      "rewards/soft_format_reward/std": 0.3968288004398346,
      "step": 450
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.05078125,
      "completions/max_length": 6558.0,
      "completions/max_terminated_length": 6558.0,
      "completions/mean_length": 769.501953125,
      "completions/mean_terminated_length": 810.668701171875,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 285.0,
      "epoch": 0.881397337241969,
      "grad_norm": 1.7148878153209133,
      "learning_rate": 1e-06,
      "loss": -0.0692,
      "num_tokens": 226456242.0,
      "reward": 0.9013671875,
      "reward_std": 0.4203614592552185,
      "rewards/accuracy_reward/mean": 0.45703125,
      "rewards/accuracy_reward/std": 0.49863746762275696,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.888671875,
      "rewards/soft_format_reward/std": 0.31484565138816833,
      "step": 451
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.041015625,
      "completions/max_length": 4988.0,
      "completions/max_terminated_length": 4988.0,
      "completions/mean_length": 842.943359375,
      "completions/mean_terminated_length": 878.9959716796875,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 205.0,
      "epoch": 0.8833516550629046,
      "grad_norm": 1.7103988708178688,
      "learning_rate": 1e-06,
      "loss": -0.0979,
      "num_tokens": 226964917.0,
      "reward": 0.83984375,
      "reward_std": 0.4147023558616638,
      "rewards/accuracy_reward/mean": 0.388671875,
      "rewards/accuracy_reward/std": 0.4879252314567566,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.90234375,
      "rewards/soft_format_reward/std": 0.29713961482048035,
      "step": 452
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.060546875,
      "completions/max_length": 6793.0,
      "completions/max_terminated_length": 6793.0,
      "completions/mean_length": 755.958984375,
      "completions/mean_terminated_length": 804.6798095703125,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 260.0,
      "epoch": 0.8853059728838403,
      "grad_norm": 1.2153991907420474,
      "learning_rate": 1e-06,
      "loss": -0.1361,
      "num_tokens": 227430064.0,
      "reward": 0.728515625,
      "reward_std": 0.41981327533721924,
      "rewards/accuracy_reward/mean": 0.29296875,
      "rewards/accuracy_reward/std": 0.455569326877594,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.87109375,
      "rewards/soft_format_reward/std": 0.33542385697364807,
      "step": 453
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.04296875,
      "completions/max_length": 6801.0,
      "completions/max_terminated_length": 6801.0,
      "completions/mean_length": 829.00390625,
      "completions/mean_terminated_length": 866.2244262695312,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 328.0,
      "epoch": 0.8872602907047759,
      "grad_norm": 1.0143662061273302,
      "learning_rate": 1e-06,
      "loss": -0.0666,
      "num_tokens": 227941026.0,
      "reward": 0.7861328125,
      "reward_std": 0.35778701305389404,
      "rewards/accuracy_reward/mean": 0.330078125,
      "rewards/accuracy_reward/std": 0.47070086002349854,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.912109375,
      "rewards/soft_format_reward/std": 0.2834126651287079,
      "step": 454
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.05859375,
      "completions/max_length": 7529.0,
      "completions/max_terminated_length": 7529.0,
      "completions/mean_length": 816.46484375,
      "completions/mean_terminated_length": 867.2822265625,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 314.0,
      "epoch": 0.8892146085257115,
      "grad_norm": 1.74482817636668,
      "learning_rate": 1e-06,
      "loss": -0.1584,
      "num_tokens": 228430320.0,
      "reward": 0.7958984375,
      "reward_std": 0.42428719997406006,
      "rewards/accuracy_reward/mean": 0.34375,
      "rewards/accuracy_reward/std": 0.4754233956336975,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.904296875,
      "rewards/soft_format_reward/std": 0.2944713830947876,
      "step": 455
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 7920.0,
      "completions/max_terminated_length": 7920.0,
      "completions/mean_length": 740.548828125,
      "completions/mean_terminated_length": 776.96923828125,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 312.0,
      "epoch": 0.8911689263466471,
      "grad_norm": 1.4782964756227825,
      "learning_rate": 1e-06,
      "loss": -0.1101,
      "num_tokens": 228876761.0,
      "reward": 0.712890625,
      "reward_std": 0.30588382482528687,
      "rewards/accuracy_reward/mean": 0.25,
      "rewards/accuracy_reward/std": 0.43343618512153625,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.92578125,
      "rewards/soft_format_reward/std": 0.2623828947544098,
      "step": 456
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.025390625,
      "completions/max_length": 6767.0,
      "completions/max_terminated_length": 6767.0,
      "completions/mean_length": 855.7265625,
      "completions/mean_terminated_length": 878.0200805664062,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 229.0,
      "epoch": 0.8931232441675827,
      "grad_norm": 0.7929241327750952,
      "learning_rate": 1e-06,
      "loss": -0.0717,
      "num_tokens": 229395565.0,
      "reward": 0.8359375,
      "reward_std": 0.3727160096168518,
      "rewards/accuracy_reward/mean": 0.3515625,
      "rewards/accuracy_reward/std": 0.4779251217842102,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.96875,
      "rewards/soft_format_reward/std": 0.17416280508041382,
      "step": 457
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017578125,
      "completions/max_length": 8164.0,
      "completions/max_terminated_length": 8164.0,
      "completions/mean_length": 845.150390625,
      "completions/mean_terminated_length": 860.2723388671875,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 340.0,
      "epoch": 0.8950775619885184,
      "grad_norm": 0.7768193705441528,
      "learning_rate": 1e-06,
      "loss": -0.0476,
      "num_tokens": 229903898.0,
      "reward": 0.9296875,
      "reward_std": 0.40889424085617065,
      "rewards/accuracy_reward/mean": 0.443359375,
      "rewards/accuracy_reward/std": 0.49726733565330505,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.97265625,
      "rewards/soft_format_reward/std": 0.16324250400066376,
      "step": 458
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.009765625,
      "completions/max_length": 5151.0,
      "completions/max_terminated_length": 5151.0,
      "completions/mean_length": 929.96484375,
      "completions/mean_terminated_length": 939.1361083984375,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 213.0,
      "epoch": 0.897031879809454,
      "grad_norm": 0.7104551416698567,
      "learning_rate": 1e-06,
      "loss": -0.0456,
      "num_tokens": 230452392.0,
      "reward": 0.732421875,
      "reward_std": 0.3058362901210785,
      "rewards/accuracy_reward/mean": 0.2421875,
      "rewards/accuracy_reward/std": 0.42882615327835083,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.98046875,
      "rewards/soft_format_reward/std": 0.1385180652141571,
      "step": 459
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.021484375,
      "completions/max_length": 3596.0,
      "completions/max_terminated_length": 3596.0,
      "completions/mean_length": 830.177734375,
      "completions/mean_terminated_length": 848.4052124023438,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 315.0,
      "epoch": 0.8989861976303897,
      "grad_norm": 0.7393593831490892,
      "learning_rate": 1e-06,
      "loss": -0.0349,
      "num_tokens": 230953843.0,
      "reward": 0.880859375,
      "reward_std": 0.3243863880634308,
      "rewards/accuracy_reward/mean": 0.396484375,
      "rewards/accuracy_reward/std": 0.4896455705165863,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.96875,
      "rewards/soft_format_reward/std": 0.17416280508041382,
      "step": 460
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.009765625,
      "completions/max_length": 6314.0,
      "completions/max_terminated_length": 6314.0,
      "completions/mean_length": 907.875,
      "completions/mean_terminated_length": 916.8284301757812,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 220.0,
      "epoch": 0.9009405154513253,
      "grad_norm": 0.5389355879892459,
      "learning_rate": 1e-06,
      "loss": -0.0437,
      "num_tokens": 231491571.0,
      "reward": 0.9267578125,
      "reward_std": 0.33239421248435974,
      "rewards/accuracy_reward/mean": 0.435546875,
      "rewards/accuracy_reward/std": 0.49631330370903015,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.982421875,
      "rewards/soft_format_reward/std": 0.13154059648513794,
      "step": 461
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.021484375,
      "completions/max_length": 6925.0,
      "completions/max_terminated_length": 6925.0,
      "completions/mean_length": 864.8828125,
      "completions/mean_terminated_length": 883.8722534179688,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 249.0,
      "epoch": 0.9028948332722609,
      "grad_norm": 0.6214866889272922,
      "learning_rate": 1e-06,
      "loss": -0.0585,
      "num_tokens": 232005927.0,
      "reward": 0.8662109375,
      "reward_std": 0.3578924536705017,
      "rewards/accuracy_reward/mean": 0.380859375,
      "rewards/accuracy_reward/std": 0.48607301712036133,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.970703125,
      "rewards/soft_format_reward/std": 0.16880230605602264,
      "step": 462
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 5988.0,
      "completions/max_terminated_length": 5988.0,
      "completions/mean_length": 878.0234375,
      "completions/mean_terminated_length": 891.9603881835938,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 226.0,
      "epoch": 0.9048491510931965,
      "grad_norm": 0.5965500887413218,
      "learning_rate": 1e-06,
      "loss": -0.053,
      "num_tokens": 232529299.0,
      "reward": 0.9443359375,
      "reward_std": 0.38830047845840454,
      "rewards/accuracy_reward/mean": 0.455078125,
      "rewards/accuracy_reward/std": 0.4984649419784546,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.978515625,
      "rewards/soft_format_reward/std": 0.14513419568538666,
      "step": 463
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.009765625,
      "completions/max_length": 2278.0,
      "completions/max_terminated_length": 2278.0,
      "completions/mean_length": 734.92578125,
      "completions/mean_terminated_length": 742.173583984375,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 213.0,
      "epoch": 0.9068034689141322,
      "grad_norm": 0.6019906833328859,
      "learning_rate": 1e-06,
      "loss": -0.0066,
      "num_tokens": 232966173.0,
      "reward": 1.1201171875,
      "reward_std": 0.2919842600822449,
      "rewards/accuracy_reward/mean": 0.625,
      "rewards/accuracy_reward/std": 0.4845963716506958,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.990234375,
      "rewards/soft_format_reward/std": 0.09843364357948303,
      "step": 464
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013671875,
      "completions/max_length": 2833.0,
      "completions/max_terminated_length": 2833.0,
      "completions/mean_length": 776.91796875,
      "completions/mean_terminated_length": 787.6871337890625,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 288.0,
      "epoch": 0.9087577867350678,
      "grad_norm": 0.5215486200911099,
      "learning_rate": 1e-06,
      "loss": -0.0437,
      "num_tokens": 233428259.0,
      "reward": 1.07421875,
      "reward_std": 0.2650104761123657,
      "rewards/accuracy_reward/mean": 0.583984375,
      "rewards/accuracy_reward/std": 0.493378221988678,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.98046875,
      "rewards/soft_format_reward/std": 0.1385180652141571,
      "step": 465
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 7633.0,
      "completions/max_terminated_length": 7633.0,
      "completions/mean_length": 762.11328125,
      "completions/mean_terminated_length": 774.2103881835938,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 295.0,
      "epoch": 0.9107121045560034,
      "grad_norm": 0.5741432760093643,
      "learning_rate": 1e-06,
      "loss": -0.0257,
      "num_tokens": 233880493.0,
      "reward": 1.072265625,
      "reward_std": 0.37611234188079834,
      "rewards/accuracy_reward/mean": 0.580078125,
      "rewards/accuracy_reward/std": 0.4940285086631775,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.984375,
      "rewards/soft_format_reward/std": 0.12414088100194931,
      "step": 466
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03515625,
      "completions/max_length": 7668.0,
      "completions/max_terminated_length": 7668.0,
      "completions/mean_length": 717.677734375,
      "completions/mean_terminated_length": 743.8279418945312,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 286.0,
      "epoch": 0.9126664223769391,
      "grad_norm": 0.6780032338708568,
      "learning_rate": 1e-06,
      "loss": -0.0732,
      "num_tokens": 234307736.0,
      "reward": 0.943359375,
      "reward_std": 0.3413737416267395,
      "rewards/accuracy_reward/mean": 0.462890625,
      "rewards/accuracy_reward/std": 0.4991086423397064,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.9609375,
      "rewards/soft_format_reward/std": 0.1939331740140915,
      "step": 467
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.037109375,
      "completions/max_length": 5676.0,
      "completions/max_terminated_length": 5676.0,
      "completions/mean_length": 669.8046875,
      "completions/mean_terminated_length": 695.61865234375,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 367.0,
      "epoch": 0.9146207401978746,
      "grad_norm": 0.9422764202961365,
      "learning_rate": 1e-06,
      "loss": -0.0675,
      "num_tokens": 234715908.0,
      "reward": 1.029296875,
      "reward_std": 0.3299490213394165,
      "rewards/accuracy_reward/mean": 0.548828125,
      "rewards/accuracy_reward/std": 0.498096764087677,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.9609375,
      "rewards/soft_format_reward/std": 0.1939331740140915,
      "step": 468
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017578125,
      "completions/max_length": 5438.0,
      "completions/max_terminated_length": 5438.0,
      "completions/mean_length": 777.9296875,
      "completions/mean_terminated_length": 791.848876953125,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 272.0,
      "epoch": 0.9165750580188103,
      "grad_norm": 0.5789160710563247,
      "learning_rate": 1e-06,
      "loss": -0.0271,
      "num_tokens": 235178544.0,
      "reward": 1.107421875,
      "reward_std": 0.31259697675704956,
      "rewards/accuracy_reward/mean": 0.6171875,
      "rewards/accuracy_reward/std": 0.486548513174057,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.98046875,
      "rewards/soft_format_reward/std": 0.1385180652141571,
      "step": 469
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0390625,
      "completions/max_length": 7111.0,
      "completions/max_terminated_length": 7111.0,
      "completions/mean_length": 816.837890625,
      "completions/mean_terminated_length": 850.0426635742188,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 334.0,
      "epoch": 0.9185293758397459,
      "grad_norm": 0.5758210097295741,
      "learning_rate": 1e-06,
      "loss": -0.0602,
      "num_tokens": 235664669.0,
      "reward": 0.9736328125,
      "reward_std": 0.2642187774181366,
      "rewards/accuracy_reward/mean": 0.4921875,
      "rewards/accuracy_reward/std": 0.5004279017448425,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.962890625,
      "rewards/soft_format_reward/std": 0.18921469151973724,
      "step": 470
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.021484375,
      "completions/max_length": 7104.0,
      "completions/max_terminated_length": 7104.0,
      "completions/mean_length": 674.55859375,
      "completions/mean_terminated_length": 689.3692626953125,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 295.0,
      "epoch": 0.9204836936606816,
      "grad_norm": 0.6497836324935389,
      "learning_rate": 1e-06,
      "loss": -0.0393,
      "num_tokens": 236069947.0,
      "reward": 1.0791015625,
      "reward_std": 0.3011692762374878,
      "rewards/accuracy_reward/mean": 0.591796875,
      "rewards/accuracy_reward/std": 0.49198177456855774,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.974609375,
      "rewards/soft_format_reward/std": 0.15746226906776428,
      "step": 471
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.021484375,
      "completions/max_length": 5687.0,
      "completions/max_terminated_length": 5687.0,
      "completions/mean_length": 772.49609375,
      "completions/mean_terminated_length": 789.4570922851562,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 283.0,
      "epoch": 0.9224380114816172,
      "grad_norm": 0.5783808120324807,
      "learning_rate": 1e-06,
      "loss": -0.0454,
      "num_tokens": 236528537.0,
      "reward": 1.0234375,
      "reward_std": 0.30921226739883423,
      "rewards/accuracy_reward/mean": 0.53515625,
      "rewards/accuracy_reward/std": 0.49925029277801514,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.9765625,
      "rewards/soft_format_reward/std": 0.15143637359142303,
      "step": 472
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 3878.0,
      "completions/max_terminated_length": 3878.0,
      "completions/mean_length": 701.900390625,
      "completions/mean_terminated_length": 736.4200439453125,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 264.0,
      "epoch": 0.9243923293025528,
      "grad_norm": 0.5515443128324211,
      "learning_rate": 1e-06,
      "loss": -0.0806,
      "num_tokens": 236947494.0,
      "reward": 1.03125,
      "reward_std": 0.3871540427207947,
      "rewards/accuracy_reward/mean": 0.556640625,
      "rewards/accuracy_reward/std": 0.49726733565330505,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.94921875,
      "rewards/soft_format_reward/std": 0.21976542472839355,
      "step": 473
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.078125,
      "completions/max_length": 7876.0,
      "completions/max_terminated_length": 7876.0,
      "completions/mean_length": 780.8359375,
      "completions/mean_terminated_length": 847.0084838867188,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 311.0,
      "epoch": 0.9263466471234885,
      "grad_norm": 0.9965213328415409,
      "learning_rate": 1e-06,
      "loss": -0.1591,
      "num_tokens": 237419074.0,
      "reward": 0.994140625,
      "reward_std": 0.3784325122833252,
      "rewards/accuracy_reward/mean": 0.5390625,
      "rewards/accuracy_reward/std": 0.4989593029022217,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.91015625,
      "rewards/soft_format_reward/std": 0.2862374484539032,
      "step": 474
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.037109375,
      "completions/max_length": 7124.0,
      "completions/max_terminated_length": 7124.0,
      "completions/mean_length": 888.017578125,
      "completions/mean_terminated_length": 922.2413330078125,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 371.0,
      "epoch": 0.928300964944424,
      "grad_norm": 0.9802810434685564,
      "learning_rate": 1e-06,
      "loss": -0.0565,
      "num_tokens": 237948235.0,
      "reward": 0.9951171875,
      "reward_std": 0.3843446969985962,
      "rewards/accuracy_reward/mean": 0.515625,
      "rewards/accuracy_reward/std": 0.5002445578575134,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.958984375,
      "rewards/soft_format_reward/std": 0.19852031767368317,
      "step": 475
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.091796875,
      "completions/max_length": 8153.0,
      "completions/max_terminated_length": 8153.0,
      "completions/mean_length": 762.35546875,
      "completions/mean_terminated_length": 839.4107666015625,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 372.0,
      "epoch": 0.9302552827653597,
      "grad_norm": 0.8744620153789759,
      "learning_rate": 1e-06,
      "loss": -0.1564,
      "num_tokens": 238407985.0,
      "reward": 0.982421875,
      "reward_std": 0.4088676869869232,
      "rewards/accuracy_reward/mean": 0.53125,
      "rewards/accuracy_reward/std": 0.4995105266571045,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.90234375,
      "rewards/soft_format_reward/std": 0.29713961482048035,
      "step": 476
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0546875,
      "completions/max_length": 7231.0,
      "completions/max_terminated_length": 7231.0,
      "completions/mean_length": 794.966796875,
      "completions/mean_terminated_length": 840.9566040039062,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 253.0,
      "epoch": 0.9322096005862953,
      "grad_norm": 0.6613375246696646,
      "learning_rate": 1e-06,
      "loss": -0.1236,
      "num_tokens": 238878544.0,
      "reward": 1.0673828125,
      "reward_std": 0.2954113483428955,
      "rewards/accuracy_reward/mean": 0.595703125,
      "rewards/accuracy_reward/std": 0.4912354052066803,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.943359375,
      "rewards/soft_format_reward/std": 0.23138070106506348,
      "step": 477
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.07421875,
      "completions/max_length": 7122.0,
      "completions/max_terminated_length": 7122.0,
      "completions/mean_length": 810.68359375,
      "completions/mean_terminated_length": 875.675048828125,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 339.0,
      "epoch": 0.934163918407231,
      "grad_norm": 0.9317973658296996,
      "learning_rate": 1e-06,
      "loss": -0.1198,
      "num_tokens": 239358910.0,
      "reward": 0.95703125,
      "reward_std": 0.46614953875541687,
      "rewards/accuracy_reward/mean": 0.49609375,
      "rewards/accuracy_reward/std": 0.5004737377166748,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.921875,
      "rewards/soft_format_reward/std": 0.26863065361976624,
      "step": 478
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.08203125,
      "completions/max_length": 8120.0,
      "completions/max_terminated_length": 8120.0,
      "completions/mean_length": 823.8203125,
      "completions/mean_terminated_length": 897.4382934570312,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 339.0,
      "epoch": 0.9361182362281666,
      "grad_norm": 0.9342506130221416,
      "learning_rate": 1e-06,
      "loss": -0.1382,
      "num_tokens": 239844674.0,
      "reward": 0.9140625,
      "reward_std": 0.39317864179611206,
      "rewards/accuracy_reward/mean": 0.45703125,
      "rewards/accuracy_reward/std": 0.49863746762275696,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.9140625,
      "rewards/soft_format_reward/std": 0.28054583072662354,
      "step": 479
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.06640625,
      "completions/max_length": 7618.0,
      "completions/max_terminated_length": 7618.0,
      "completions/mean_length": 835.5546875,
      "completions/mean_terminated_length": 894.9874267578125,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 255.0,
      "epoch": 0.9380725540491023,
      "grad_norm": 0.5430348666668907,
      "learning_rate": 1e-06,
      "loss": -0.1157,
      "num_tokens": 240339150.0,
      "reward": 0.951171875,
      "reward_std": 0.36654937267303467,
      "rewards/accuracy_reward/mean": 0.484375,
      "rewards/accuracy_reward/std": 0.5002445578575134,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.93359375,
      "rewards/soft_format_reward/std": 0.2492343932390213,
      "step": 480
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.021484375,
      "completions/max_length": 8184.0,
      "completions/max_terminated_length": 8184.0,
      "completions/mean_length": 922.828125,
      "completions/mean_terminated_length": 943.08984375,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 352.0,
      "epoch": 0.9400268718700379,
      "grad_norm": 0.6480506259793776,
      "learning_rate": 1e-06,
      "loss": -0.0145,
      "num_tokens": 240874614.0,
      "reward": 0.9296875,
      "reward_std": 0.34232279658317566,
      "rewards/accuracy_reward/mean": 0.44140625,
      "rewards/accuracy_reward/std": 0.4970405399799347,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.9765625,
      "rewards/soft_format_reward/std": 0.15143637359142303,
      "step": 481
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.021484375,
      "completions/max_length": 7655.0,
      "completions/max_terminated_length": 7655.0,
      "completions/mean_length": 949.119140625,
      "completions/mean_terminated_length": 969.9580688476562,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 383.0,
      "epoch": 0.9419811896909734,
      "grad_norm": 0.5093417529520246,
      "learning_rate": 1e-06,
      "loss": -0.0184,
      "num_tokens": 241426579.0,
      "reward": 0.9365234375,
      "reward_std": 0.3903234004974365,
      "rewards/accuracy_reward/mean": 0.447265625,
      "rewards/accuracy_reward/std": 0.4976975917816162,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.978515625,
      "rewards/soft_format_reward/std": 0.14513419568538666,
      "step": 482
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03515625,
      "completions/max_length": 7506.0,
      "completions/max_terminated_length": 7506.0,
      "completions/mean_length": 969.109375,
      "completions/mean_terminated_length": 1004.4210815429688,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 396.0,
      "epoch": 0.9439355075119091,
      "grad_norm": 0.6694203730648159,
      "learning_rate": 1e-06,
      "loss": -0.0575,
      "num_tokens": 241986443.0,
      "reward": 0.9892578125,
      "reward_std": 0.4110221266746521,
      "rewards/accuracy_reward/mean": 0.5078125,
      "rewards/accuracy_reward/std": 0.5004279017448425,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.962890625,
      "rewards/soft_format_reward/std": 0.18921469151973724,
      "step": 483
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.044921875,
      "completions/max_length": 5962.0,
      "completions/max_terminated_length": 5962.0,
      "completions/mean_length": 784.2421875,
      "completions/mean_terminated_length": 821.1288452148438,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 326.0,
      "epoch": 0.9458898253328447,
      "grad_norm": 0.46927479618168727,
      "learning_rate": 1e-06,
      "loss": -0.0903,
      "num_tokens": 242447735.0,
      "reward": 1.1123046875,
      "reward_std": 0.32275721430778503,
      "rewards/accuracy_reward/mean": 0.63671875,
      "rewards/accuracy_reward/std": 0.4814152419567108,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.951171875,
      "rewards/soft_format_reward/std": 0.2157193273305893,
      "step": 484
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.025390625,
      "completions/max_length": 5921.0,
      "completions/max_terminated_length": 5921.0,
      "completions/mean_length": 893.67578125,
      "completions/mean_terminated_length": 916.9579467773438,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 373.0,
      "epoch": 0.9478441431537804,
      "grad_norm": 0.535487662744918,
      "learning_rate": 1e-06,
      "loss": -0.0351,
      "num_tokens": 242969537.0,
      "reward": 0.857421875,
      "reward_std": 0.3181568682193756,
      "rewards/accuracy_reward/mean": 0.37109375,
      "rewards/accuracy_reward/std": 0.4835699498653412,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.97265625,
      "rewards/soft_format_reward/std": 0.16324250400066376,
      "step": 485
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.01953125,
      "completions/max_length": 2214.0,
      "completions/max_terminated_length": 2214.0,
      "completions/mean_length": 941.484375,
      "completions/mean_terminated_length": 960.2390747070312,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 381.0,
      "epoch": 0.949798460974716,
      "grad_norm": 0.46893495952048375,
      "learning_rate": 1e-06,
      "loss": -0.0519,
      "num_tokens": 243518537.0,
      "reward": 1.146484375,
      "reward_std": 0.32323187589645386,
      "rewards/accuracy_reward/mean": 0.65625,
      "rewards/accuracy_reward/std": 0.4754233956336975,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.98046875,
      "rewards/soft_format_reward/std": 0.1385180652141571,
      "step": 486
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.04296875,
      "completions/max_length": 7516.0,
      "completions/max_terminated_length": 7516.0,
      "completions/mean_length": 965.45703125,
      "completions/mean_terminated_length": 1008.8040161132812,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 285.0,
      "epoch": 0.9517527787956517,
      "grad_norm": 0.6311387421030279,
      "learning_rate": 1e-06,
      "loss": -0.0614,
      "num_tokens": 244086067.0,
      "reward": 1.11328125,
      "reward_std": 0.37617602944374084,
      "rewards/accuracy_reward/mean": 0.634765625,
      "rewards/accuracy_reward/std": 0.4819667339324951,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.95703125,
      "rewards/soft_format_reward/std": 0.2029850035905838,
      "step": 487
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.080078125,
      "completions/max_length": 7478.0,
      "completions/max_terminated_length": 7478.0,
      "completions/mean_length": 803.916015625,
      "completions/mean_terminated_length": 873.89599609375,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 376.0,
      "epoch": 0.9537070966165873,
      "grad_norm": 1.0897314012512074,
      "learning_rate": 1e-06,
      "loss": -0.1505,
      "num_tokens": 244558680.0,
      "reward": 0.916015625,
      "reward_std": 0.37017822265625,
      "rewards/accuracy_reward/mean": 0.4609375,
      "rewards/accuracy_reward/std": 0.4989593029022217,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.91015625,
      "rewards/soft_format_reward/std": 0.2862374484539032,
      "step": 488
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.103515625,
      "completions/max_length": 7676.0,
      "completions/max_terminated_length": 7676.0,
      "completions/mean_length": 804.150390625,
      "completions/mean_terminated_length": 897.0043334960938,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 346.0,
      "epoch": 0.955661414437523,
      "grad_norm": 0.6848232002649514,
      "learning_rate": 1e-06,
      "loss": -0.2141,
      "num_tokens": 245035157.0,
      "reward": 1.048828125,
      "reward_std": 0.4050791561603546,
      "rewards/accuracy_reward/mean": 0.6015625,
      "rewards/accuracy_reward/std": 0.4900552034378052,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.89453125,
      "rewards/soft_format_reward/std": 0.3074568510055542,
      "step": 489
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09765625,
      "completions/max_length": 6959.0,
      "completions/max_terminated_length": 6959.0,
      "completions/mean_length": 925.359375,
      "completions/mean_terminated_length": 1025.5064697265625,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 422.0,
      "epoch": 0.9576157322584585,
      "grad_norm": 0.5541681981586442,
      "learning_rate": 1e-06,
      "loss": -0.1795,
      "num_tokens": 245574269.0,
      "reward": 0.8955078125,
      "reward_std": 0.4295736849308014,
      "rewards/accuracy_reward/mean": 0.447265625,
      "rewards/accuracy_reward/std": 0.4976975917816162,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.896484375,
      "rewards/soft_format_reward/std": 0.30492907762527466,
      "step": 490
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 8133.0,
      "completions/max_terminated_length": 8133.0,
      "completions/mean_length": 907.287109375,
      "completions/mean_terminated_length": 951.90771484375,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 333.0,
      "epoch": 0.9595700500793941,
      "grad_norm": 0.4985487593719111,
      "learning_rate": 1e-06,
      "loss": -0.0809,
      "num_tokens": 246104832.0,
      "reward": 1.01171875,
      "reward_std": 0.3561217486858368,
      "rewards/accuracy_reward/mean": 0.541015625,
      "rewards/accuracy_reward/std": 0.49880221486091614,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.94140625,
      "rewards/soft_format_reward/std": 0.23509246110916138,
      "step": 491
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.103515625,
      "completions/max_length": 7933.0,
      "completions/max_terminated_length": 7933.0,
      "completions/mean_length": 795.931640625,
      "completions/mean_terminated_length": 887.8366088867188,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 344.0,
      "epoch": 0.9615243679003298,
      "grad_norm": 0.9344839438192083,
      "learning_rate": 1e-06,
      "loss": -0.1672,
      "num_tokens": 246576221.0,
      "reward": 0.8955078125,
      "reward_std": 0.3685035705566406,
      "rewards/accuracy_reward/mean": 0.44921875,
      "rewards/accuracy_reward/std": 0.497901052236557,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.892578125,
      "rewards/soft_format_reward/std": 0.30995169281959534,
      "step": 492
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.091796875,
      "completions/max_length": 7176.0,
      "completions/max_terminated_length": 7176.0,
      "completions/mean_length": 822.322265625,
      "completions/mean_terminated_length": 905.438720703125,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 250.0,
      "epoch": 0.9634786857212654,
      "grad_norm": 0.5695672312178879,
      "learning_rate": 1e-06,
      "loss": -0.1495,
      "num_tokens": 247062242.0,
      "reward": 0.9580078125,
      "reward_std": 0.33883267641067505,
      "rewards/accuracy_reward/mean": 0.505859375,
      "rewards/accuracy_reward/std": 0.5004546642303467,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.904296875,
      "rewards/soft_format_reward/std": 0.2944713830947876,
      "step": 493
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.048828125,
      "completions/max_length": 5793.0,
      "completions/max_terminated_length": 5793.0,
      "completions/mean_length": 856.931640625,
      "completions/mean_terminated_length": 900.9219970703125,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 365.0,
      "epoch": 0.9654330035422011,
      "grad_norm": 0.4524615129397472,
      "learning_rate": 1e-06,
      "loss": -0.0749,
      "num_tokens": 247560207.0,
      "reward": 0.9912109375,
      "reward_std": 0.3110504746437073,
      "rewards/accuracy_reward/mean": 0.517578125,
      "rewards/accuracy_reward/std": 0.5001795887947083,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.947265625,
      "rewards/soft_format_reward/std": 0.22372129559516907,
      "step": 494
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0390625,
      "completions/max_length": 6020.0,
      "completions/max_terminated_length": 6020.0,
      "completions/mean_length": 733.865234375,
      "completions/mean_terminated_length": 763.6971435546875,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 301.0,
      "epoch": 0.9673873213631367,
      "grad_norm": 0.5168983525550634,
      "learning_rate": 1e-06,
      "loss": -0.0714,
      "num_tokens": 247996186.0,
      "reward": 1.109375,
      "reward_std": 0.34613001346588135,
      "rewards/accuracy_reward/mean": 0.62890625,
      "rewards/accuracy_reward/std": 0.4835699498653412,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.9609375,
      "rewards/soft_format_reward/std": 0.1939331740140915,
      "step": 495
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0390625,
      "completions/max_length": 7130.0,
      "completions/max_terminated_length": 7130.0,
      "completions/mean_length": 857.07421875,
      "completions/mean_terminated_length": 891.9146118164062,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 353.0,
      "epoch": 0.9693416391840723,
      "grad_norm": 0.49569564507491365,
      "learning_rate": 1e-06,
      "loss": -0.0839,
      "num_tokens": 248501696.0,
      "reward": 1.06640625,
      "reward_std": 0.3585534989833832,
      "rewards/accuracy_reward/mean": 0.5859375,
      "rewards/accuracy_reward/std": 0.49304109811782837,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.9609375,
      "rewards/soft_format_reward/std": 0.1939331740140915,
      "step": 496
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.01953125,
      "completions/max_length": 5489.0,
      "completions/max_terminated_length": 5489.0,
      "completions/mean_length": 876.552734375,
      "completions/mean_terminated_length": 894.0139770507812,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 346.0,
      "epoch": 0.9712959570050079,
      "grad_norm": 0.6147626643385528,
      "learning_rate": 1e-06,
      "loss": -0.0312,
      "num_tokens": 249010459.0,
      "reward": 1.0478515625,
      "reward_std": 0.35256725549697876,
      "rewards/accuracy_reward/mean": 0.55859375,
      "rewards/accuracy_reward/std": 0.4970405399799347,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.978515625,
      "rewards/soft_format_reward/std": 0.14513419568538666,
      "step": 497
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 6724.0,
      "completions/max_terminated_length": 6724.0,
      "completions/mean_length": 920.267578125,
      "completions/mean_terminated_length": 949.95361328125,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 342.0,
      "epoch": 0.9732502748259436,
      "grad_norm": 0.5261575537054536,
      "learning_rate": 1e-06,
      "loss": -0.0535,
      "num_tokens": 249542932.0,
      "reward": 0.890625,
      "reward_std": 0.352202832698822,
      "rewards/accuracy_reward/mean": 0.408203125,
      "rewards/accuracy_reward/std": 0.49198177456855774,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.96484375,
      "rewards/soft_format_reward/std": 0.1843547374010086,
      "step": 498
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.064453125,
      "completions/max_length": 7714.0,
      "completions/max_terminated_length": 7714.0,
      "completions/mean_length": 903.548828125,
      "completions/mean_terminated_length": 965.7975463867188,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 307.0,
      "epoch": 0.9752045926468792,
      "grad_norm": 0.4186029173907592,
      "learning_rate": 1e-06,
      "loss": -0.1213,
      "num_tokens": 250068413.0,
      "reward": 0.9453125,
      "reward_std": 0.31097647547721863,
      "rewards/accuracy_reward/mean": 0.478515625,
      "rewards/accuracy_reward/std": 0.5000267624855042,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.93359375,
      "rewards/soft_format_reward/std": 0.2492343932390213,
      "step": 499
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.05078125,
      "completions/max_length": 8131.0,
      "completions/max_terminated_length": 8131.0,
      "completions/mean_length": 836.07421875,
      "completions/mean_terminated_length": 880.8024291992188,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 262.0,
      "epoch": 0.9771589104678148,
      "grad_norm": 0.46700975891340746,
      "learning_rate": 1e-06,
      "loss": -0.0894,
      "num_tokens": 250565155.0,
      "reward": 0.990234375,
      "reward_std": 0.3769418001174927,
      "rewards/accuracy_reward/mean": 0.515625,
      "rewards/accuracy_reward/std": 0.5002445578575134,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.94921875,
      "rewards/soft_format_reward/std": 0.21976542472839355,
      "step": 500
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.052734375,
      "completions/max_length": 7985.0,
      "completions/max_terminated_length": 7985.0,
      "completions/mean_length": 798.17578125,
      "completions/mean_terminated_length": 842.6103515625,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 257.0,
      "epoch": 0.9791132282887505,
      "grad_norm": 0.5474339883422088,
      "learning_rate": 1e-06,
      "loss": -0.0925,
      "num_tokens": 251042013.0,
      "reward": 0.98828125,
      "reward_std": 0.29229220747947693,
      "rewards/accuracy_reward/mean": 0.515625,
      "rewards/accuracy_reward/std": 0.5002445578575134,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.9453125,
      "rewards/soft_format_reward/std": 0.2275916188955307,
      "step": 501
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.05859375,
      "completions/max_length": 8155.0,
      "completions/max_terminated_length": 8155.0,
      "completions/mean_length": 842.6640625,
      "completions/mean_terminated_length": 895.112060546875,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 260.0,
      "epoch": 0.981067546109686,
      "grad_norm": 0.5621193326438604,
      "learning_rate": 1e-06,
      "loss": -0.1066,
      "num_tokens": 251542577.0,
      "reward": 1.01171875,
      "reward_std": 0.37604188919067383,
      "rewards/accuracy_reward/mean": 0.541015625,
      "rewards/accuracy_reward/std": 0.49880221486091614,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.94140625,
      "rewards/soft_format_reward/std": 0.23509246110916138,
      "step": 502
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.029296875,
      "completions/max_length": 5010.0,
      "completions/max_terminated_length": 5010.0,
      "completions/mean_length": 668.14453125,
      "completions/mean_terminated_length": 688.309814453125,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 310.0,
      "epoch": 0.9830218639306217,
      "grad_norm": 0.5498844223977205,
      "learning_rate": 1e-06,
      "loss": -0.0264,
      "num_tokens": 251941115.0,
      "reward": 1.1494140625,
      "reward_std": 0.3015892505645752,
      "rewards/accuracy_reward/mean": 0.6640625,
      "rewards/accuracy_reward/std": 0.4727790653705597,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.970703125,
      "rewards/soft_format_reward/std": 0.16880230605602264,
      "step": 503
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017578125,
      "completions/max_length": 4417.0,
      "completions/max_terminated_length": 4417.0,
      "completions/mean_length": 627.01171875,
      "completions/mean_terminated_length": 638.2305908203125,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 192.0,
      "epoch": 0.9849761817515573,
      "grad_norm": 0.6223520678937557,
      "learning_rate": 1e-06,
      "loss": -0.0194,
      "num_tokens": 252324625.0,
      "reward": 1.12890625,
      "reward_std": 0.27855873107910156,
      "rewards/accuracy_reward/mean": 0.638671875,
      "rewards/accuracy_reward/std": 0.48085519671440125,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.98046875,
      "rewards/soft_format_reward/std": 0.1385180652141571,
      "step": 504
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.025390625,
      "completions/max_length": 3569.0,
      "completions/max_terminated_length": 3569.0,
      "completions/mean_length": 774.861328125,
      "completions/mean_terminated_length": 795.048095703125,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 294.0,
      "epoch": 0.986930499572493,
      "grad_norm": 0.528956451291311,
      "learning_rate": 1e-06,
      "loss": -0.0397,
      "num_tokens": 252782186.0,
      "reward": 1.0947265625,
      "reward_std": 0.32669395208358765,
      "rewards/accuracy_reward/mean": 0.607421875,
      "rewards/accuracy_reward/std": 0.4888018071651459,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.974609375,
      "rewards/soft_format_reward/std": 0.15746226906776428,
      "step": 505
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.033203125,
      "completions/max_length": 6655.0,
      "completions/max_terminated_length": 6655.0,
      "completions/mean_length": 857.521484375,
      "completions/mean_terminated_length": 886.9717407226562,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 391.0,
      "epoch": 0.9888848173934286,
      "grad_norm": 0.5300826491459741,
      "learning_rate": 1e-06,
      "loss": -0.0529,
      "num_tokens": 253287717.0,
      "reward": 1.1240234375,
      "reward_std": 0.35081547498703003,
      "rewards/accuracy_reward/mean": 0.640625,
      "rewards/accuracy_reward/std": 0.48028653860092163,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.966796875,
      "rewards/soft_format_reward/std": 0.17934183776378632,
      "step": 506
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03515625,
      "completions/max_length": 6549.0,
      "completions/max_terminated_length": 6549.0,
      "completions/mean_length": 802.947265625,
      "completions/mean_terminated_length": 832.2044677734375,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 360.0,
      "epoch": 0.9908391352143643,
      "grad_norm": 0.42853276451266314,
      "learning_rate": 1e-06,
      "loss": -0.0491,
      "num_tokens": 253759706.0,
      "reward": 1.1533203125,
      "reward_std": 0.2587732672691345,
      "rewards/accuracy_reward/mean": 0.671875,
      "rewards/accuracy_reward/std": 0.4699897766113281,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.962890625,
      "rewards/soft_format_reward/std": 0.18921469151973724,
      "step": 507
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0234375,
      "completions/max_length": 8005.0,
      "completions/max_terminated_length": 8005.0,
      "completions/mean_length": 748.52734375,
      "completions/mean_terminated_length": 766.4920654296875,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 364.0,
      "epoch": 0.9927934530352999,
      "grad_norm": 0.5564003388206429,
      "learning_rate": 1e-06,
      "loss": -0.0374,
      "num_tokens": 254204776.0,
      "reward": 1.0625,
      "reward_std": 0.3248573839664459,
      "rewards/accuracy_reward/mean": 0.57421875,
      "rewards/accuracy_reward/std": 0.4949444830417633,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.9765625,
      "rewards/soft_format_reward/std": 0.15143637359142303,
      "step": 508
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.009765625,
      "completions/max_length": 7709.0,
      "completions/max_terminated_length": 7709.0,
      "completions/mean_length": 776.3515625,
      "completions/mean_terminated_length": 784.0078735351562,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 285.0,
      "epoch": 0.9947477708562354,
      "grad_norm": 0.508090925111072,
      "learning_rate": 1e-06,
      "loss": -0.0219,
      "num_tokens": 254664380.0,
      "reward": 1.1435546875,
      "reward_std": 0.28476428985595703,
      "rewards/accuracy_reward/mean": 0.6484375,
      "rewards/accuracy_reward/std": 0.4779251217842102,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.990234375,
      "rewards/soft_format_reward/std": 0.09843364357948303,
      "step": 509
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.029296875,
      "completions/max_length": 8036.0,
      "completions/max_terminated_length": 8036.0,
      "completions/mean_length": 699.50390625,
      "completions/mean_terminated_length": 720.6156616210938,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 254.0,
      "epoch": 0.9967020886771711,
      "grad_norm": 0.5967369543635935,
      "learning_rate": 1e-06,
      "loss": -0.0371,
      "num_tokens": 255087710.0,
      "reward": 1.1630859375,
      "reward_std": 0.27583375573158264,
      "rewards/accuracy_reward/mean": 0.677734375,
      "rewards/accuracy_reward/std": 0.46780112385749817,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.970703125,
      "rewards/soft_format_reward/std": 0.16880230605602264,
      "step": 510
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.019920318725099584,
      "completions/max_length": 7975.0,
      "completions/max_terminated_length": 7975.0,
      "completions/mean_length": 738.486083984375,
      "completions/mean_terminated_length": 753.4959106445312,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 266.0,
      "epoch": 0.9986564064981067,
      "grad_norm": 0.7423001685047017,
      "learning_rate": 1e-06,
      "loss": -0.0269,
      "num_tokens": 255530192.0,
      "reward": 1.150390625,
      "reward_std": 0.3285100758075714,
      "rewards/accuracy_reward/mean": 0.66015625,
      "rewards/accuracy_reward/std": 0.4741191864013672,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/soft_format_reward/mean": 0.98046875,
      "rewards/soft_format_reward/std": 0.1385180652141571,
      "step": 511
    },
    {
      "epoch": 0.9986564064981067,
      "step": 511,
      "total_flos": 0.0,
      "train_loss": -0.00574838798383794,
      "train_runtime": 41865.0435,
      "train_samples_per_second": 0.391,
      "train_steps_per_second": 0.012
    }
  ],
  "logging_steps": 1,
  "max_steps": 511,
  "num_input_tokens_seen": 255530192,
  "num_train_epochs": 1,
  "save_steps": 52,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}