{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9986564064981067, "eval_steps": 500, "global_step": 511, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7912.0, "completions/max_terminated_length": 7912.0, "completions/mean_length": 617.13671875, "completions/mean_terminated_length": 617.13671875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.0019543178209356295, "grad_norm": 0.5497008291481603, "learning_rate": 0.0, "loss": -0.0172, "num_tokens": 381814.0, "reward": 0.06640625, "reward_std": 0.16296617686748505, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.01953125, "rewards/soft_format_reward/std": 0.1385180652141571, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5592.0, "completions/max_terminated_length": 5592.0, "completions/mean_length": 567.18359375, "completions/mean_terminated_length": 567.18359375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.003908635641871259, "grad_norm": 2.698078274122091, "learning_rate": 1.923076923076923e-08, "loss": -0.0685, "num_tokens": 747460.0, "reward": 0.0869140625, "reward_std": 0.219474196434021, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.025390625, "rewards/soft_format_reward/std": 0.15746226906776428, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4418.0, "completions/max_terminated_length": 4418.0, "completions/mean_length": 513.427734375, "completions/mean_terminated_length": 513.427734375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.005862953462806889, "grad_norm": 5.308384935213732, "learning_rate": 3.846153846153846e-08, "loss": -0.0673, "num_tokens": 1071487.0, "reward": 0.0888671875, "reward_std": 0.185697540640831, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.021484375, "rewards/soft_format_reward/std": 0.14513419568538666, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7199.0, "completions/max_terminated_length": 7199.0, "completions/mean_length": 565.57421875, "completions/mean_terminated_length": 565.57421875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.007817271283742518, "grad_norm": 0.48385115003594786, "learning_rate": 5.7692307692307695e-08, "loss": -0.0395, "num_tokens": 1422405.0, "reward": 0.06640625, "reward_std": 0.16206470131874084, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.0078125, "rewards/soft_format_reward/std": 0.08812850713729858, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5413.0, "completions/max_terminated_length": 5413.0, "completions/mean_length": 561.673828125, "completions/mean_terminated_length": 561.673828125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.009771589104678149, "grad_norm": 0.381530031845912, "learning_rate": 7.692307692307692e-08, "loss": -0.0071, "num_tokens": 1779006.0, "reward": 0.0263671875, "reward_std": 0.0822526216506958, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12414088100194931, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.021484375, "rewards/soft_format_reward/std": 0.14513419568538666, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7300.0, "completions/max_terminated_length": 7300.0, "completions/mean_length": 612.404296875, "completions/mean_terminated_length": 612.404296875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.011725906925613778, "grad_norm": 0.5102114685216445, "learning_rate": 9.615384615384616e-08, "loss": -0.0678, "num_tokens": 2159213.0, "reward": 0.0498046875, "reward_std": 0.12858566641807556, "rewards/accuracy_reward/mean": 0.041015625, "rewards/accuracy_reward/std": 0.19852031767368317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.017578125, "rewards/soft_format_reward/std": 0.13154059648513794, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3591.0, "completions/max_terminated_length": 3591.0, "completions/mean_length": 516.505859375, "completions/mean_terminated_length": 516.505859375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.013680224746549407, "grad_norm": 0.6048195767923973, "learning_rate": 1.1538461538461539e-07, "loss": -0.0439, "num_tokens": 2491552.0, "reward": 0.056640625, "reward_std": 0.11580279469490051, "rewards/accuracy_reward/mean": 0.044921875, "rewards/accuracy_reward/std": 0.20733514428138733, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.0234375, "rewards/soft_format_reward/std": 0.15143637359142303, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 5877.0, "completions/max_terminated_length": 5877.0, "completions/mean_length": 526.783203125, "completions/mean_terminated_length": 528.8490600585938, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.015634542567485036, "grad_norm": 0.6144749886453214, "learning_rate": 1.346153846153846e-07, "loss": -0.037, "num_tokens": 2833313.0, "reward": 0.029296875, "reward_std": 0.09001073986291885, "rewards/accuracy_reward/mean": 0.021484375, "rewards/accuracy_reward/std": 0.14513419568538666, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.015625, "rewards/soft_format_reward/std": 0.12414088100194931, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4974.0, "completions/max_terminated_length": 4974.0, "completions/mean_length": 525.9140625, "completions/mean_terminated_length": 525.9140625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.01758886038842067, "grad_norm": 0.7578893167945104, "learning_rate": 1.5384615384615385e-07, "loss": -0.0363, "num_tokens": 3176469.0, "reward": 0.0400390625, "reward_std": 0.11926878988742828, "rewards/accuracy_reward/mean": 0.02734375, "rewards/accuracy_reward/std": 0.16324250400066376, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.025390625, "rewards/soft_format_reward/std": 0.15746226906776428, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5875.0, "completions/max_terminated_length": 5875.0, "completions/mean_length": 571.982421875, "completions/mean_terminated_length": 571.982421875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.019543178209356298, "grad_norm": 0.505459854966566, "learning_rate": 1.7307692307692305e-07, "loss": -0.0648, "num_tokens": 3538620.0, "reward": 0.03515625, "reward_std": 0.104578398168087, "rewards/accuracy_reward/mean": 0.025390625, "rewards/accuracy_reward/std": 0.15746226906776428, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.01953125, "rewards/soft_format_reward/std": 0.1385180652141571, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 5900.0, "completions/max_terminated_length": 5900.0, "completions/mean_length": 553.423828125, "completions/mean_terminated_length": 554.5068359375, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.021497496030291927, "grad_norm": 0.6386183497474042, "learning_rate": 1.9230769230769231e-07, "loss": -0.0297, "num_tokens": 3893797.0, "reward": 0.0400390625, "reward_std": 0.107000432908535, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17416280508041382, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.017578125, "rewards/soft_format_reward/std": 0.13154059648513794, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6139.0, "completions/max_terminated_length": 6139.0, "completions/mean_length": 554.06640625, "completions/mean_terminated_length": 554.06640625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.023451813851227556, "grad_norm": 0.5476698283725961, "learning_rate": 2.1153846153846152e-07, "loss": -0.0111, "num_tokens": 4237223.0, "reward": 0.05078125, "reward_std": 0.12995371222496033, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.1939331740140915, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.0234375, "rewards/soft_format_reward/std": 0.15143637359142303, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8168.0, "completions/max_terminated_length": 8168.0, "completions/mean_length": 567.6796875, "completions/mean_terminated_length": 567.6796875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.025406131672163185, "grad_norm": 0.6283143368465841, "learning_rate": 2.3076923076923078e-07, "loss": -0.0371, "num_tokens": 4601331.0, "reward": 0.04296875, "reward_std": 0.1183362603187561, "rewards/accuracy_reward/mean": 0.033203125, "rewards/accuracy_reward/std": 0.17934183776378632, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.01953125, "rewards/soft_format_reward/std": 0.1385180652141571, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 3741.0, "completions/max_terminated_length": 3741.0, "completions/mean_length": 513.072265625, "completions/mean_terminated_length": 514.0762939453125, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.027360449493098814, "grad_norm": 3.1847469546145972, "learning_rate": 2.5e-07, "loss": -0.0305, "num_tokens": 4934456.0, "reward": 0.05859375, "reward_std": 0.15280833840370178, "rewards/accuracy_reward/mean": 0.048828125, "rewards/accuracy_reward/std": 0.2157193273305893, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.01953125, "rewards/soft_format_reward/std": 0.1385180652141571, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 6554.0, "completions/max_terminated_length": 6554.0, "completions/mean_length": 608.732421875, "completions/mean_terminated_length": 609.9236450195312, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.029314767314034446, "grad_norm": 0.5723525274669647, "learning_rate": 2.692307692307692e-07, "loss": -0.0246, "num_tokens": 5323775.0, "reward": 0.07421875, "reward_std": 0.15633760392665863, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.03125, "rewards/soft_format_reward/std": 0.17416280508041382, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 7342.0, "completions/max_terminated_length": 7342.0, "completions/mean_length": 609.19140625, "completions/mean_terminated_length": 611.5804443359375, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.03126908513497007, "grad_norm": 0.4570532944391167, "learning_rate": 2.884615384615384e-07, "loss": 0.018, "num_tokens": 5713905.0, "reward": 0.0537109375, "reward_std": 0.12987464666366577, "rewards/accuracy_reward/mean": 0.041015625, "rewards/accuracy_reward/std": 0.19852031767368317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.025390625, "rewards/soft_format_reward/std": 0.15746226906776428, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6857.0, "completions/max_terminated_length": 6857.0, "completions/mean_length": 606.8046875, "completions/mean_terminated_length": 606.8046875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.0332234029559057, "grad_norm": 0.7214971252015325, "learning_rate": 3.076923076923077e-07, "loss": -0.028, "num_tokens": 6092541.0, "reward": 0.0693359375, "reward_std": 0.15594926476478577, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.029296875, "rewards/soft_format_reward/std": 0.16880230605602264, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6450.0, "completions/max_terminated_length": 6450.0, "completions/mean_length": 570.732421875, "completions/mean_terminated_length": 570.732421875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.03517772077684134, "grad_norm": 0.8363578457680537, "learning_rate": 3.269230769230769e-07, "loss": -0.0084, "num_tokens": 6452228.0, "reward": 0.0400390625, "reward_std": 0.13050393760204315, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15143637359142303, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.033203125, "rewards/soft_format_reward/std": 0.17934183776378632, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 7415.0, "completions/max_terminated_length": 7415.0, "completions/mean_length": 660.81640625, "completions/mean_terminated_length": 663.4078979492188, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.037132038597776966, "grad_norm": 1.8442167181929248, "learning_rate": 3.461538461538461e-07, "loss": -0.0545, "num_tokens": 6862838.0, "reward": 0.048828125, "reward_std": 0.14615076780319214, "rewards/accuracy_reward/mean": 0.025390625, "rewards/accuracy_reward/std": 0.15746226906776428, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.046875, "rewards/soft_format_reward/std": 0.21157780289649963, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3138.0, "completions/max_terminated_length": 3138.0, "completions/mean_length": 499.787109375, "completions/mean_terminated_length": 499.787109375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.039086356418712595, "grad_norm": 15.273262941735497, "learning_rate": 3.6538461538461534e-07, "loss": -0.0211, "num_tokens": 7198249.0, "reward": 0.078125, "reward_std": 0.2060895711183548, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.05078125, "rewards/soft_format_reward/std": 0.21976542472839355, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 5856.0, "completions/max_terminated_length": 5856.0, "completions/mean_length": 687.306640625, "completions/mean_terminated_length": 688.6516723632812, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.041040674239648224, "grad_norm": 5.048231501526631, "learning_rate": 3.8461538461538463e-07, "loss": -0.0082, "num_tokens": 7618006.0, "reward": 0.10546875, "reward_std": 0.22821679711341858, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.0625, "rewards/soft_format_reward/std": 0.2422981858253479, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6152.0, "completions/max_terminated_length": 6152.0, "completions/mean_length": 708.736328125, "completions/mean_terminated_length": 708.736328125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.04299499206058385, "grad_norm": 0.4512712553803841, "learning_rate": 4.0384615384615386e-07, "loss": 0.0034, "num_tokens": 8046383.0, "reward": 0.07421875, "reward_std": 0.17045770585536957, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.04296875, "rewards/soft_format_reward/std": 0.2029850035905838, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 5840.0, "completions/max_terminated_length": 5840.0, "completions/mean_length": 687.8046875, "completions/mean_terminated_length": 689.1506958007812, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.04494930988151948, "grad_norm": 2.43420844870162, "learning_rate": 4.2307692307692304e-07, "loss": -0.0204, "num_tokens": 8472651.0, "reward": 0.087890625, "reward_std": 0.19242921471595764, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.05078125, "rewards/soft_format_reward/std": 0.21976542472839355, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7117.0, "completions/max_terminated_length": 7117.0, "completions/mean_length": 681.640625, "completions/mean_terminated_length": 681.640625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.04690362770245511, "grad_norm": 1.4085772264361187, "learning_rate": 4.423076923076923e-07, "loss": -0.0008, "num_tokens": 8892259.0, "reward": 0.0849609375, "reward_std": 0.18828873336315155, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.056640625, "rewards/soft_format_reward/std": 0.23138070106506348, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3989.0, "completions/max_terminated_length": 3989.0, "completions/mean_length": 655.474609375, "completions/mean_terminated_length": 655.474609375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.04885794552339074, "grad_norm": 0.7352936977408676, "learning_rate": 4.6153846153846156e-07, "loss": -0.0287, "num_tokens": 9297254.0, "reward": 0.0673828125, "reward_std": 0.16342398524284363, "rewards/accuracy_reward/mean": 0.033203125, "rewards/accuracy_reward/std": 0.17934183776378632, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.068359375, "rewards/soft_format_reward/std": 0.25260838866233826, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6753.0, "completions/max_terminated_length": 6753.0, "completions/mean_length": 654.60546875, "completions/mean_terminated_length": 654.60546875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.05081226334432637, "grad_norm": 1.556772523178416, "learning_rate": 4.807692307692307e-07, "loss": 0.0177, "num_tokens": 9702892.0, "reward": 0.087890625, "reward_std": 0.2010476142168045, "rewards/accuracy_reward/mean": 0.048828125, "rewards/accuracy_reward/std": 0.2157193273305893, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.078125, "rewards/soft_format_reward/std": 0.26863065361976624, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6472.0, "completions/max_terminated_length": 6472.0, "completions/mean_length": 736.044921875, "completions/mean_terminated_length": 736.044921875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.052766581165262, "grad_norm": 1.0582490484565645, "learning_rate": 5e-07, "loss": -0.0134, "num_tokens": 10148339.0, "reward": 0.1044921875, "reward_std": 0.20019401609897614, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.087890625, "rewards/soft_format_reward/std": 0.2834126651287079, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 5664.0, "completions/max_terminated_length": 5664.0, "completions/mean_length": 726.126953125, "completions/mean_terminated_length": 727.5479125976562, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.05472089898619763, "grad_norm": 2.857726402173463, "learning_rate": 5.192307692307692e-07, "loss": -0.0361, "num_tokens": 10592276.0, "reward": 0.1015625, "reward_std": 0.20877701044082642, "rewards/accuracy_reward/mean": 0.041015625, "rewards/accuracy_reward/std": 0.19852031767368317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.12109375, "rewards/soft_format_reward/std": 0.3265552520751953, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 7747.0, "completions/max_terminated_length": 7747.0, "completions/mean_length": 732.8671875, "completions/mean_terminated_length": 735.7412109375, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.056675216807133263, "grad_norm": 3.8611447120870404, "learning_rate": 5.384615384615384e-07, "loss": 0.0596, "num_tokens": 11039728.0, "reward": 0.1279296875, "reward_std": 0.2408362179994583, "rewards/accuracy_reward/mean": 0.048828125, "rewards/accuracy_reward/std": 0.2157193273305893, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.158203125, "rewards/soft_format_reward/std": 0.36528825759887695, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6036.0, "completions/max_terminated_length": 6036.0, "completions/mean_length": 728.291015625, "completions/mean_terminated_length": 728.291015625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.05862953462806889, "grad_norm": 6.140510129844352, "learning_rate": 5.576923076923077e-07, "loss": 0.0455, "num_tokens": 11479781.0, "reward": 0.1630859375, "reward_std": 0.25149115920066833, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.189453125, "rewards/soft_format_reward/std": 0.3922513723373413, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5456.0, "completions/max_terminated_length": 5456.0, "completions/mean_length": 623.748046875, "completions/mean_terminated_length": 623.748046875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.06058385244900452, "grad_norm": 20.190316320088844, "learning_rate": 5.769230769230768e-07, "loss": 0.0265, "num_tokens": 11866708.0, "reward": 0.1708984375, "reward_std": 0.2567231357097626, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.208984375, "rewards/soft_format_reward/std": 0.40698084235191345, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5453.0, "completions/max_terminated_length": 5453.0, "completions/mean_length": 609.91015625, "completions/mean_terminated_length": 609.91015625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.06253817026994014, "grad_norm": 9.473332784285086, "learning_rate": 5.961538461538461e-07, "loss": -0.0105, "num_tokens": 12249206.0, "reward": 0.2158203125, "reward_std": 0.3083522915840149, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.208984375, "rewards/soft_format_reward/std": 0.40698084235191345, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5294.0, "completions/max_terminated_length": 5294.0, "completions/mean_length": 664.775390625, "completions/mean_terminated_length": 664.775390625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.06449248809087578, "grad_norm": 10.829803763505799, "learning_rate": 6.153846153846154e-07, "loss": 0.0103, "num_tokens": 12661043.0, "reward": 0.1640625, "reward_std": 0.26412612199783325, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.20703125, "rewards/soft_format_reward/std": 0.40557438135147095, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7671.0, "completions/max_terminated_length": 7671.0, "completions/mean_length": 666.4140625, "completions/mean_terminated_length": 666.4140625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.0664468059118114, "grad_norm": 1.1974288910365694, "learning_rate": 6.346153846153845e-07, "loss": 0.0445, "num_tokens": 13073399.0, "reward": 0.2412109375, "reward_std": 0.33153459429740906, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.283203125, "rewards/soft_format_reward/std": 0.4509948492050171, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4454.0, "completions/max_terminated_length": 4454.0, "completions/mean_length": 571.095703125, "completions/mean_terminated_length": 571.095703125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.06840112373274704, "grad_norm": 13.626751077969622, "learning_rate": 6.538461538461538e-07, "loss": 0.01, "num_tokens": 13440776.0, "reward": 0.224609375, "reward_std": 0.2904402017593384, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.3125, "rewards/soft_format_reward/std": 0.4639657139778137, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2804.0, "completions/max_terminated_length": 2804.0, "completions/mean_length": 640.6328125, "completions/mean_terminated_length": 641.886474609375, "completions/min_length": 0.0, "completions/min_terminated_length": 1.0, "epoch": 0.07035544155368267, "grad_norm": 1.819852103878187, "learning_rate": 6.730769230769231e-07, "loss": 0.0503, "num_tokens": 13846364.0, "reward": 0.2392578125, "reward_std": 0.311190664768219, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.384765625, "rewards/soft_format_reward/std": 0.4870156943798065, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6238.0, "completions/max_terminated_length": 6238.0, "completions/mean_length": 639.701171875, "completions/mean_terminated_length": 639.701171875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.0723097593746183, "grad_norm": 1.4235130554682933, "learning_rate": 6.923076923076922e-07, "loss": 0.0002, "num_tokens": 14248707.0, "reward": 0.2451171875, "reward_std": 0.29699093103408813, "rewards/accuracy_reward/mean": 0.033203125, "rewards/accuracy_reward/std": 0.17934183776378632, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.423828125, "rewards/soft_format_reward/std": 0.4946470856666565, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7398.0, "completions/max_terminated_length": 7398.0, "completions/mean_length": 590.271484375, "completions/mean_terminated_length": 590.271484375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.07426407719555393, "grad_norm": 0.628319132660879, "learning_rate": 7.115384615384616e-07, "loss": 0.0754, "num_tokens": 14623614.0, "reward": 0.29296875, "reward_std": 0.2848966419696808, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.484375, "rewards/soft_format_reward/std": 0.5002445578575134, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5965.0, "completions/max_terminated_length": 5965.0, "completions/mean_length": 613.828125, "completions/mean_terminated_length": 613.828125, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.07621839501648955, "grad_norm": 0.7613404241291125, "learning_rate": 7.307692307692307e-07, "loss": 0.0549, "num_tokens": 15010438.0, "reward": 0.3310546875, "reward_std": 0.2927667796611786, "rewards/accuracy_reward/mean": 0.044921875, "rewards/accuracy_reward/std": 0.20733514428138733, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.572265625, "rewards/soft_format_reward/std": 0.4952339828014374, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4613.0, "completions/max_terminated_length": 4613.0, "completions/mean_length": 597.8828125, "completions/mean_terminated_length": 597.8828125, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.07817271283742519, "grad_norm": 0.7429177868961824, "learning_rate": 7.5e-07, "loss": 0.0252, "num_tokens": 15389594.0, "reward": 0.40625, "reward_std": 0.3103194832801819, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.6953125, "rewards/soft_format_reward/std": 0.4607250988483429, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5187.0, "completions/max_terminated_length": 5187.0, "completions/mean_length": 574.287109375, "completions/mean_terminated_length": 574.287109375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.08012703065836081, "grad_norm": 29.45387609226835, "learning_rate": 7.692307692307693e-07, "loss": 0.0312, "num_tokens": 15757581.0, "reward": 0.435546875, "reward_std": 0.2659274637699127, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.76953125, "rewards/soft_format_reward/std": 0.42154473066329956, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2535.0, "completions/max_terminated_length": 2535.0, "completions/mean_length": 541.6171875, "completions/mean_terminated_length": 541.6171875, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.08208134847929645, "grad_norm": 13.134675337038862, "learning_rate": 7.884615384615384e-07, "loss": 0.0486, "num_tokens": 16107305.0, "reward": 0.48046875, "reward_std": 0.26097944378852844, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.8125, "rewards/soft_format_reward/std": 0.39069411158561707, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6681.0, "completions/max_terminated_length": 6681.0, "completions/mean_length": 602.7421875, "completions/mean_terminated_length": 602.7421875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.08403566630023207, "grad_norm": 1.3047464546026393, "learning_rate": 8.076923076923077e-07, "loss": 0.0358, "num_tokens": 16486613.0, "reward": 0.453125, "reward_std": 0.23063214123249054, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.1843547374010086, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.8359375, "rewards/soft_format_reward/std": 0.37069445848464966, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6393.0, "completions/max_terminated_length": 6393.0, "completions/mean_length": 585.7734375, "completions/mean_terminated_length": 585.7734375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.0859899841211677, "grad_norm": 1.0830243597441251, "learning_rate": 8.269230769230768e-07, "loss": 0.0811, "num_tokens": 16853809.0, "reward": 0.470703125, "reward_std": 0.1985776722431183, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.1939331740140915, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.86328125, "rewards/soft_format_reward/std": 0.3438861668109894, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7895.0, "completions/max_terminated_length": 7895.0, "completions/mean_length": 609.8515625, "completions/mean_terminated_length": 609.8515625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.08794430194210333, "grad_norm": 0.6775285892968461, "learning_rate": 8.461538461538461e-07, "loss": 0.0693, "num_tokens": 17235989.0, "reward": 0.4794921875, "reward_std": 0.20187821984291077, "rewards/accuracy_reward/mean": 0.029296875, "rewards/accuracy_reward/std": 0.16880230605602264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.900390625, "rewards/soft_format_reward/std": 0.29977133870124817, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4606.0, "completions/max_terminated_length": 4606.0, "completions/mean_length": 584.8515625, "completions/mean_terminated_length": 584.8515625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.08989861976303896, "grad_norm": 1.1014489991571397, "learning_rate": 8.653846153846154e-07, "loss": 0.0795, "num_tokens": 17606137.0, "reward": 0.498046875, "reward_std": 0.2243758738040924, "rewards/accuracy_reward/mean": 0.048828125, "rewards/accuracy_reward/std": 0.2157193273305893, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.8984375, "rewards/soft_format_reward/std": 0.30236753821372986, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3147.0, "completions/max_terminated_length": 3147.0, "completions/mean_length": 544.201171875, "completions/mean_terminated_length": 544.201171875, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.0918529375839746, "grad_norm": 0.574106279906047, "learning_rate": 8.846153846153846e-07, "loss": 0.0314, "num_tokens": 17956080.0, "reward": 0.533203125, "reward_std": 0.22982193529605865, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.91015625, "rewards/soft_format_reward/std": 0.2862374484539032, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 5575.0, "completions/max_terminated_length": 5575.0, "completions/mean_length": 532.357421875, "completions/mean_terminated_length": 533.3992309570312, "completions/min_length": 0.0, "completions/min_terminated_length": 76.0, "epoch": 0.09380725540491022, "grad_norm": 0.6550073360929063, "learning_rate": 9.038461538461538e-07, "loss": 0.0482, "num_tokens": 18319511.0, "reward": 0.5029296875, "reward_std": 0.19548678398132324, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.1939331740140915, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.927734375, "rewards/soft_format_reward/std": 0.2591804563999176, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4345.0, "completions/max_terminated_length": 4345.0, "completions/mean_length": 549.814453125, "completions/mean_terminated_length": 549.814453125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.09576157322584586, "grad_norm": 0.5803676816838714, "learning_rate": 9.230769230769231e-07, "loss": 0.0097, "num_tokens": 18672504.0, "reward": 0.537109375, "reward_std": 0.15639463067054749, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.95703125, "rewards/soft_format_reward/std": 0.2029850035905838, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5540.0, "completions/max_terminated_length": 5540.0, "completions/mean_length": 479.1484375, "completions/mean_terminated_length": 479.1484375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.09771589104678148, "grad_norm": 0.481600628947521, "learning_rate": 9.423076923076923e-07, "loss": 0.0072, "num_tokens": 18988532.0, "reward": 0.5673828125, "reward_std": 0.19646382331848145, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.962890625, "rewards/soft_format_reward/std": 0.18921469151973724, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3445.0, "completions/max_terminated_length": 3445.0, "completions/mean_length": 571.2265625, "completions/mean_terminated_length": 571.2265625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.09967020886771712, "grad_norm": 0.4012053390386752, "learning_rate": 9.615384615384615e-07, "loss": 0.0531, "num_tokens": 19359016.0, "reward": 0.486328125, "reward_std": 0.08090324699878693, "rewards/accuracy_reward/mean": 0.005859375, "rewards/accuracy_reward/std": 0.07639661431312561, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.9609375, "rewards/soft_format_reward/std": 0.1939331740140915, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2326.0, "completions/max_terminated_length": 2326.0, "completions/mean_length": 550.65234375, "completions/mean_terminated_length": 550.65234375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.10162452668865274, "grad_norm": 0.5384873921400348, "learning_rate": 9.807692307692306e-07, "loss": 0.0421, "num_tokens": 19731990.0, "reward": 0.51953125, "reward_std": 0.1248648464679718, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.1843547374010086, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.96875, "rewards/soft_format_reward/std": 0.17416280508041382, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1958.0, "completions/max_terminated_length": 1958.0, "completions/mean_length": 499.494140625, "completions/mean_terminated_length": 500.47161865234375, "completions/min_length": 0.0, "completions/min_terminated_length": 25.0, "epoch": 0.10357884450958837, "grad_norm": 0.498222527692459, "learning_rate": 1e-06, "loss": 0.018, "num_tokens": 20062755.0, "reward": 0.5810546875, "reward_std": 0.19981679320335388, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.962890625, "rewards/soft_format_reward/std": 0.18921469151973724, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2145.0, "completions/max_terminated_length": 2145.0, "completions/mean_length": 492.4609375, "completions/mean_terminated_length": 492.4609375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.105533162330524, "grad_norm": 0.5552275571581469, "learning_rate": 1e-06, "loss": 0.0138, "num_tokens": 20397039.0, "reward": 0.583984375, "reward_std": 0.1560341715812683, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.98828125, "rewards/soft_format_reward/std": 0.10772226005792618, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2139.0, "completions/max_terminated_length": 2139.0, "completions/mean_length": 564.458984375, "completions/mean_terminated_length": 564.458984375, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.10748748015145963, "grad_norm": 0.2624076614092068, "learning_rate": 1e-06, "loss": -0.0092, "num_tokens": 20761898.0, "reward": 0.52734375, "reward_std": 0.06822281330823898, "rewards/accuracy_reward/mean": 0.033203125, "rewards/accuracy_reward/std": 0.17934183776378632, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.98828125, "rewards/soft_format_reward/std": 0.10772226005792618, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2003.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 522.08203125, "completions/mean_terminated_length": 522.08203125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.10944179797239525, "grad_norm": 0.19130326711252885, "learning_rate": 1e-06, "loss": -0.0066, "num_tokens": 21108452.0, "reward": 0.5283203125, "reward_std": 0.06760530173778534, "rewards/accuracy_reward/mean": 0.029296875, "rewards/accuracy_reward/std": 0.16880230605602264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1655.0, "completions/max_terminated_length": 1655.0, "completions/mean_length": 502.4375, "completions/mean_terminated_length": 502.4375, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.11139611579333089, "grad_norm": 0.2850231203751861, "learning_rate": 1e-06, "loss": -0.0068, "num_tokens": 21448228.0, "reward": 0.529296875, "reward_std": 0.06024399399757385, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17416280508041382, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5151.0, "completions/max_terminated_length": 5151.0, "completions/mean_length": 564.33203125, "completions/mean_terminated_length": 564.33203125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.11335043361426653, "grad_norm": 0.3203274608081484, "learning_rate": 1e-06, "loss": 0.0214, "num_tokens": 21823918.0, "reward": 0.509765625, "reward_std": 0.043847277760505676, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.10772226005792618, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3359.0, "completions/max_terminated_length": 3359.0, "completions/mean_length": 575.369140625, "completions/mean_terminated_length": 575.369140625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.11530475143520215, "grad_norm": 0.22055660729223742, "learning_rate": 1e-06, "loss": 0.0264, "num_tokens": 22203387.0, "reward": 0.5107421875, "reward_std": 0.043188292533159256, "rewards/accuracy_reward/mean": 0.013671875, "rewards/accuracy_reward/std": 0.1162383034825325, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.994140625, "rewards/soft_format_reward/std": 0.07639661431312561, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2076.0, "completions/max_terminated_length": 2076.0, "completions/mean_length": 536.74609375, "completions/mean_terminated_length": 536.74609375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.11725906925613779, "grad_norm": 0.872705985489336, "learning_rate": 1e-06, "loss": 0.0064, "num_tokens": 22553513.0, "reward": 0.51171875, "reward_std": 0.046875, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.10772226005792618, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1573.0, "completions/max_terminated_length": 1573.0, "completions/mean_length": 557.34375, "completions/mean_terminated_length": 557.34375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.11921338707707341, "grad_norm": 0.39900505592881763, "learning_rate": 1e-06, "loss": -0.005, "num_tokens": 22951465.0, "reward": 0.5546875, "reward_std": 0.11014671623706818, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1769.0, "completions/max_terminated_length": 1769.0, "completions/mean_length": 543.732421875, "completions/mean_terminated_length": 543.732421875, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.12116770489800904, "grad_norm": 0.5615740045847875, "learning_rate": 1e-06, "loss": -0.0009, "num_tokens": 23304432.0, "reward": 0.556640625, "reward_std": 0.1440330445766449, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1900.0, "completions/max_terminated_length": 1900.0, "completions/mean_length": 509.953125, "completions/mean_terminated_length": 509.953125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.12312202271894467, "grad_norm": 0.501055796908243, "learning_rate": 1e-06, "loss": 0.0046, "num_tokens": 23625384.0, "reward": 0.544921875, "reward_std": 0.07823248207569122, "rewards/accuracy_reward/mean": 0.044921875, "rewards/accuracy_reward/std": 0.20733514428138733, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2610.0, "completions/max_terminated_length": 2610.0, "completions/mean_length": 520.38671875, "completions/mean_terminated_length": 520.38671875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.1250763405398803, "grad_norm": 0.6171900301640806, "learning_rate": 1e-06, "loss": 0.0256, "num_tokens": 23960798.0, "reward": 0.5634765625, "reward_std": 0.11427251994609833, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3039.0, "completions/max_terminated_length": 3039.0, "completions/mean_length": 545.3046875, "completions/mean_terminated_length": 545.3046875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.12703065836081592, "grad_norm": 0.3948540084280699, "learning_rate": 1e-06, "loss": -0.004, "num_tokens": 24339754.0, "reward": 0.5498046875, "reward_std": 0.11211925745010376, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1775.0, "completions/max_terminated_length": 1775.0, "completions/mean_length": 494.08203125, "completions/mean_terminated_length": 494.08203125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.12898497618175156, "grad_norm": 0.3304401976843138, "learning_rate": 1e-06, "loss": -0.0111, "num_tokens": 24677252.0, "reward": 0.572265625, "reward_std": 0.13781127333641052, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2882.0, "completions/max_terminated_length": 2882.0, "completions/mean_length": 541.107421875, "completions/mean_terminated_length": 541.107421875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.1309392940026872, "grad_norm": 0.24461956142234229, "learning_rate": 1e-06, "loss": -0.0105, "num_tokens": 25029467.0, "reward": 0.5205078125, "reward_std": 0.060458000749349594, "rewards/accuracy_reward/mean": 0.021484375, "rewards/accuracy_reward/std": 0.14513419568538666, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2125.0, "completions/max_terminated_length": 2125.0, "completions/mean_length": 536.458984375, "completions/mean_terminated_length": 536.458984375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.1328936118236228, "grad_norm": 0.18539220344454985, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 25386966.0, "reward": 0.5146484375, "reward_std": 0.04230354726314545, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12414088100194931, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1245.0, "completions/max_terminated_length": 1245.0, "completions/mean_length": 530.16015625, "completions/mean_terminated_length": 530.16015625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.13484792964455844, "grad_norm": 0.25468225730494143, "learning_rate": 1e-06, "loss": 0.0031, "num_tokens": 25738152.0, "reward": 0.517578125, "reward_std": 0.043135739862918854, "rewards/accuracy_reward/mean": 0.017578125, "rewards/accuracy_reward/std": 0.13154059648513794, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1268.0, "completions/max_terminated_length": 1268.0, "completions/mean_length": 508.466796875, "completions/mean_terminated_length": 508.466796875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.13680224746549408, "grad_norm": 0.20022401436907783, "learning_rate": 1e-06, "loss": -0.0044, "num_tokens": 26060535.0, "reward": 0.513671875, "reward_std": 0.043847277760505676, "rewards/accuracy_reward/mean": 0.013671875, "rewards/accuracy_reward/std": 0.1162383034825325, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1671.0, "completions/max_terminated_length": 1671.0, "completions/mean_length": 537.802734375, "completions/mean_terminated_length": 537.802734375, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.1387565652864297, "grad_norm": 0.2668852184808627, "learning_rate": 1e-06, "loss": -0.0031, "num_tokens": 26416818.0, "reward": 0.5390625, "reward_std": 0.07992979884147644, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.1939331740140915, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4278.0, "completions/max_terminated_length": 4278.0, "completions/mean_length": 562.83984375, "completions/mean_terminated_length": 562.83984375, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.14071088310736535, "grad_norm": 0.2304930419413057, "learning_rate": 1e-06, "loss": -0.0037, "num_tokens": 26772208.0, "reward": 0.5322265625, "reward_std": 0.0869225412607193, "rewards/accuracy_reward/mean": 0.033203125, "rewards/accuracy_reward/std": 0.17934183776378632, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2317.0, "completions/max_terminated_length": 2317.0, "completions/mean_length": 594.578125, "completions/mean_terminated_length": 594.578125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.14266520092830096, "grad_norm": 0.22724559534599176, "learning_rate": 1e-06, "loss": -0.0032, "num_tokens": 27160920.0, "reward": 0.53125, "reward_std": 0.07702205330133438, "rewards/accuracy_reward/mean": 0.033203125, "rewards/accuracy_reward/std": 0.17934183776378632, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2284.0, "completions/max_terminated_length": 2284.0, "completions/mean_length": 578.845703125, "completions/mean_terminated_length": 578.845703125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.1446195187492366, "grad_norm": 0.22287386551482907, "learning_rate": 1e-06, "loss": -0.006, "num_tokens": 27542313.0, "reward": 0.5166015625, "reward_std": 0.051993079483509064, "rewards/accuracy_reward/mean": 0.017578125, "rewards/accuracy_reward/std": 0.13154059648513794, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1372.0, "completions/max_terminated_length": 1372.0, "completions/mean_length": 613.80859375, "completions/mean_terminated_length": 613.80859375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.14657383657017223, "grad_norm": 0.18029464401038803, "learning_rate": 1e-06, "loss": -0.0038, "num_tokens": 27934759.0, "reward": 0.5244140625, "reward_std": 0.048324182629585266, "rewards/accuracy_reward/mean": 0.025390625, "rewards/accuracy_reward/std": 0.15746226906776428, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3789.0, "completions/max_terminated_length": 3789.0, "completions/mean_length": 580.494140625, "completions/mean_terminated_length": 580.494140625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.14852815439110786, "grad_norm": 0.23367405275494105, "learning_rate": 1e-06, "loss": 0.0071, "num_tokens": 28304788.0, "reward": 0.5107421875, "reward_std": 0.04472580552101135, "rewards/accuracy_reward/mean": 0.013671875, "rewards/accuracy_reward/std": 0.1162383034825325, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.994140625, "rewards/soft_format_reward/std": 0.07639661431312561, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2703.0, "completions/max_terminated_length": 2703.0, "completions/mean_length": 662.54296875, "completions/mean_terminated_length": 662.54296875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.15048247221204347, "grad_norm": 0.15822988644136107, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 28722346.0, "reward": 0.5166015625, "reward_std": 0.047041989862918854, "rewards/accuracy_reward/mean": 0.017578125, "rewards/accuracy_reward/std": 0.13154059648513794, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1879.0, "completions/max_terminated_length": 1879.0, "completions/mean_length": 651.3359375, "completions/mean_terminated_length": 651.3359375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.1524367900329791, "grad_norm": 0.1356967673128265, "learning_rate": 1e-06, "loss": -0.0013, "num_tokens": 29145574.0, "reward": 0.5078125, "reward_std": 0.02960042469203472, "rewards/accuracy_reward/mean": 0.009765625, "rewards/accuracy_reward/std": 0.09843364357948303, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1352.0, "completions/max_terminated_length": 1352.0, "completions/mean_length": 649.826171875, "completions/mean_terminated_length": 649.826171875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.15439110785391474, "grad_norm": 0.23821706835435685, "learning_rate": 1e-06, "loss": -0.0035, "num_tokens": 29575085.0, "reward": 0.5078125, "reward_std": 0.03125, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.08812850713729858, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4816.0, "completions/max_terminated_length": 4816.0, "completions/mean_length": 647.208984375, "completions/mean_terminated_length": 647.208984375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.15634542567485038, "grad_norm": 0.20539071002520867, "learning_rate": 1e-06, "loss": 0.001, "num_tokens": 29972664.0, "reward": 0.5126953125, "reward_std": 0.05446862801909447, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12414088100194931, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.994140625, "rewards/soft_format_reward/std": 0.07639661431312561, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2282.0, "completions/max_terminated_length": 2282.0, "completions/mean_length": 580.908203125, "completions/mean_terminated_length": 580.908203125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.158299743495786, "grad_norm": 0.38231227027935133, "learning_rate": 1e-06, "loss": 0.0016, "num_tokens": 30353817.0, "reward": 0.5732421875, "reward_std": 0.14383290708065033, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3727.0, "completions/max_terminated_length": 3727.0, "completions/mean_length": 590.91796875, "completions/mean_terminated_length": 590.91796875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.16025406131672162, "grad_norm": 0.2840205979252729, "learning_rate": 1e-06, "loss": -0.0014, "num_tokens": 30747375.0, "reward": 0.6103515625, "reward_std": 0.1419982761144638, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2953.0, "completions/max_terminated_length": 2953.0, "completions/mean_length": 650.36328125, "completions/mean_terminated_length": 650.36328125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.16220837913765726, "grad_norm": 0.26607531904033477, "learning_rate": 1e-06, "loss": 0.019, "num_tokens": 31164505.0, "reward": 0.685546875, "reward_std": 0.18039385974407196, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39069411158561707, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1547.0, "completions/max_terminated_length": 1547.0, "completions/mean_length": 627.5546875, "completions/mean_terminated_length": 627.5546875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.1641626969585929, "grad_norm": 0.4799261235792459, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 31566789.0, "reward": 0.58984375, "reward_std": 0.15521381795406342, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2090.0, "completions/max_terminated_length": 2090.0, "completions/mean_length": 589.341796875, "completions/mean_terminated_length": 589.341796875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.16611701477952853, "grad_norm": 0.35292376862513175, "learning_rate": 1e-06, "loss": -0.0007, "num_tokens": 31952884.0, "reward": 0.630859375, "reward_std": 0.15488673746585846, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5012.0, "completions/max_terminated_length": 5012.0, "completions/mean_length": 627.216796875, "completions/mean_terminated_length": 627.216796875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.16807133260046414, "grad_norm": 0.2808525258157565, "learning_rate": 1e-06, "loss": 0.0234, "num_tokens": 32358211.0, "reward": 0.6005859375, "reward_std": 0.11676256358623505, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5346.0, "completions/max_terminated_length": 5346.0, "completions/mean_length": 600.85546875, "completions/mean_terminated_length": 600.85546875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.17002565042139978, "grad_norm": 1.2788367609840963, "learning_rate": 1e-06, "loss": 0.0332, "num_tokens": 32762441.0, "reward": 0.5771484375, "reward_std": 0.15553121268749237, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.994140625, "rewards/soft_format_reward/std": 0.07639661431312561, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4114.0, "completions/max_terminated_length": 4114.0, "completions/mean_length": 583.478515625, "completions/mean_terminated_length": 583.478515625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.1719799682423354, "grad_norm": 0.8381002401118538, "learning_rate": 1e-06, "loss": 0.0158, "num_tokens": 33153966.0, "reward": 0.5673828125, "reward_std": 0.1311238408088684, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2656.0, "completions/max_terminated_length": 2656.0, "completions/mean_length": 561.716796875, "completions/mean_terminated_length": 561.716796875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.17393428606327105, "grad_norm": 1.2959332169759497, "learning_rate": 1e-06, "loss": 0.0016, "num_tokens": 33502845.0, "reward": 0.5556640625, "reward_std": 0.11637574434280396, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.994140625, "rewards/soft_format_reward/std": 0.07639661431312561, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4902.0, "completions/max_terminated_length": 4902.0, "completions/mean_length": 671.85546875, "completions/mean_terminated_length": 671.85546875, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.17588860388420666, "grad_norm": 0.3142154309040994, "learning_rate": 1e-06, "loss": 0.0212, "num_tokens": 33926915.0, "reward": 0.5625, "reward_std": 0.12823474407196045, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.9921875, "rewards/soft_format_reward/std": 0.08812850713729858, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3444.0, "completions/max_terminated_length": 3444.0, "completions/mean_length": 608.5546875, "completions/mean_terminated_length": 608.5546875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.1778429217051423, "grad_norm": 0.3600101015470794, "learning_rate": 1e-06, "loss": 0.0039, "num_tokens": 34302927.0, "reward": 0.5361328125, "reward_std": 0.09729446470737457, "rewards/accuracy_reward/mean": 0.041015625, "rewards/accuracy_reward/std": 0.19852031767368317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.990234375, "rewards/soft_format_reward/std": 0.09843364357948303, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3501.0, "completions/max_terminated_length": 3501.0, "completions/mean_length": 619.130859375, "completions/mean_terminated_length": 619.130859375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.17979723952607793, "grad_norm": 0.37543906173547886, "learning_rate": 1e-06, "loss": 0.0062, "num_tokens": 34686082.0, "reward": 0.5576171875, "reward_std": 0.10678014904260635, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3983.0, "completions/max_terminated_length": 3983.0, "completions/mean_length": 631.79296875, "completions/mean_terminated_length": 631.79296875, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.18175155734701356, "grad_norm": 0.34367370480737347, "learning_rate": 1e-06, "loss": -0.0066, "num_tokens": 35081752.0, "reward": 0.5400390625, "reward_std": 0.08358919620513916, "rewards/accuracy_reward/mean": 0.041015625, "rewards/accuracy_reward/std": 0.19852031767368317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3042.0, "completions/max_terminated_length": 3042.0, "completions/mean_length": 641.7734375, "completions/mean_terminated_length": 641.7734375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.1837058751679492, "grad_norm": 0.16839045296922697, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 35492212.0, "reward": 0.521484375, "reward_std": 0.05099457502365112, "rewards/accuracy_reward/mean": 0.021484375, "rewards/accuracy_reward/std": 0.14513419568538666, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4073.0, "completions/max_terminated_length": 4073.0, "completions/mean_length": 653.001953125, "completions/mean_terminated_length": 653.001953125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.1856601929888848, "grad_norm": 0.26168912129631455, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 35894165.0, "reward": 0.5625, "reward_std": 0.12186098843812943, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2100.0, "completions/max_terminated_length": 2100.0, "completions/mean_length": 558.08203125, "completions/mean_terminated_length": 558.08203125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.18761451080982045, "grad_norm": 0.2948149032225118, "learning_rate": 1e-06, "loss": 0.001, "num_tokens": 36251375.0, "reward": 0.5322265625, "reward_std": 0.06337852776050568, "rewards/accuracy_reward/mean": 0.033203125, "rewards/accuracy_reward/std": 0.17934183776378632, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1425.0, "completions/max_terminated_length": 1425.0, "completions/mean_length": 576.83984375, "completions/mean_terminated_length": 576.83984375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.18956882863075608, "grad_norm": 0.24383757507524517, "learning_rate": 1e-06, "loss": -0.0086, "num_tokens": 36616573.0, "reward": 0.5625, "reward_std": 0.10904473811388016, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5066.0, "completions/max_terminated_length": 5066.0, "completions/mean_length": 659.486328125, "completions/mean_terminated_length": 659.486328125, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.19152314645169172, "grad_norm": 0.1457015429228493, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 37019510.0, "reward": 0.53515625, "reward_std": 0.05931950733065605, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.1843547374010086, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3219.0, "completions/max_terminated_length": 3219.0, "completions/mean_length": 654.630859375, "completions/mean_terminated_length": 654.630859375, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.19347746427262733, "grad_norm": 0.32090870702118257, "learning_rate": 1e-06, "loss": 0.0044, "num_tokens": 37417945.0, "reward": 0.58984375, "reward_std": 0.15871897339820862, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4465.0, "completions/max_terminated_length": 4465.0, "completions/mean_length": 657.755859375, "completions/mean_terminated_length": 657.755859375, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.19543178209356296, "grad_norm": 0.20341807021185154, "learning_rate": 1e-06, "loss": 0.0169, "num_tokens": 37839564.0, "reward": 0.544921875, "reward_std": 0.08864613622426987, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2882.0, "completions/max_terminated_length": 2882.0, "completions/mean_length": 643.47265625, "completions/mean_terminated_length": 643.47265625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.1973860999144986, "grad_norm": 0.2535460898682244, "learning_rate": 1e-06, "loss": -0.002, "num_tokens": 38255278.0, "reward": 0.52734375, "reward_std": 0.06777782738208771, "rewards/accuracy_reward/mean": 0.029296875, "rewards/accuracy_reward/std": 0.16880230605602264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2238.0, "completions/max_terminated_length": 2238.0, "completions/mean_length": 660.02734375, "completions/mean_terminated_length": 660.02734375, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.19934041773543423, "grad_norm": 0.2392828254528082, "learning_rate": 1e-06, "loss": 0.0013, "num_tokens": 38666508.0, "reward": 0.5888671875, "reward_std": 0.12386061251163483, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1597.0, "completions/max_terminated_length": 1597.0, "completions/mean_length": 695.3359375, "completions/mean_terminated_length": 695.3359375, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.20129473555636984, "grad_norm": 0.2827856598070154, "learning_rate": 1e-06, "loss": 0.0057, "num_tokens": 39105480.0, "reward": 0.59765625, "reward_std": 0.16757801175117493, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2671.0, "completions/max_terminated_length": 2671.0, "completions/mean_length": 693.658203125, "completions/mean_terminated_length": 693.658203125, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.20324905337730548, "grad_norm": 0.27075802048695746, "learning_rate": 1e-06, "loss": -0.0067, "num_tokens": 39541945.0, "reward": 0.640625, "reward_std": 0.20036140084266663, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1863.0, "completions/max_terminated_length": 1863.0, "completions/mean_length": 646.435546875, "completions/mean_terminated_length": 646.435546875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.2052033711982411, "grad_norm": 0.27514293915783544, "learning_rate": 1e-06, "loss": -0.0029, "num_tokens": 39951528.0, "reward": 0.568359375, "reward_std": 0.1401737928390503, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2086.0, "completions/max_terminated_length": 2086.0, "completions/mean_length": 719.32421875, "completions/mean_terminated_length": 719.32421875, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.20715768901917675, "grad_norm": 0.227750684574606, "learning_rate": 1e-06, "loss": 0.0015, "num_tokens": 40405582.0, "reward": 0.580078125, "reward_std": 0.12753018736839294, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2637.0, "completions/max_terminated_length": 2637.0, "completions/mean_length": 691.31640625, "completions/mean_terminated_length": 691.31640625, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.20911200684011239, "grad_norm": 0.2632670115755812, "learning_rate": 1e-06, "loss": 0.0095, "num_tokens": 40849920.0, "reward": 0.591796875, "reward_std": 0.18596382439136505, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1475.0, "completions/max_terminated_length": 1475.0, "completions/mean_length": 687.16796875, "completions/mean_terminated_length": 687.16796875, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.211066324661048, "grad_norm": 0.2255648801347303, "learning_rate": 1e-06, "loss": 0.0052, "num_tokens": 41288214.0, "reward": 0.591796875, "reward_std": 0.1527780294418335, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1898.0, "completions/max_terminated_length": 1898.0, "completions/mean_length": 684.91796875, "completions/mean_terminated_length": 684.91796875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.21302064248198363, "grad_norm": 0.23856808511899583, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 41723484.0, "reward": 0.59375, "reward_std": 0.14524322748184204, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3242.0, "completions/max_terminated_length": 3242.0, "completions/mean_length": 741.443359375, "completions/mean_terminated_length": 741.443359375, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.21497496030291927, "grad_norm": 0.26469393080648274, "learning_rate": 1e-06, "loss": 0.011, "num_tokens": 42177983.0, "reward": 0.701171875, "reward_std": 0.22451630234718323, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.4027182459831238, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1848.0, "completions/max_terminated_length": 1848.0, "completions/mean_length": 706.369140625, "completions/mean_terminated_length": 706.369140625, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.2169292781238549, "grad_norm": 0.26485973865282497, "learning_rate": 1e-06, "loss": 0.005, "num_tokens": 42619340.0, "reward": 0.599609375, "reward_std": 0.1698872447013855, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1545.0, "completions/max_terminated_length": 1545.0, "completions/mean_length": 706.29296875, "completions/mean_terminated_length": 706.29296875, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.2188835959447905, "grad_norm": 0.2913404975770583, "learning_rate": 1e-06, "loss": -0.001, "num_tokens": 43058418.0, "reward": 0.640625, "reward_std": 0.19215244054794312, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1465.0, "completions/max_terminated_length": 1465.0, "completions/mean_length": 664.96875, "completions/mean_terminated_length": 664.96875, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.22083791376572615, "grad_norm": 0.15839005706172635, "learning_rate": 1e-06, "loss": -0.0026, "num_tokens": 43480722.0, "reward": 0.525390625, "reward_std": 0.06354551017284393, "rewards/accuracy_reward/mean": 0.025390625, "rewards/accuracy_reward/std": 0.15746226906776428, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2067.0, "completions/max_terminated_length": 2067.0, "completions/mean_length": 640.341796875, "completions/mean_terminated_length": 640.341796875, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.22279223158666178, "grad_norm": 0.2661420754364224, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 43889441.0, "reward": 0.580078125, "reward_std": 0.1470821499824524, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2283.0, "completions/max_terminated_length": 2283.0, "completions/mean_length": 682.2578125, "completions/mean_terminated_length": 682.2578125, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.22474654940759742, "grad_norm": 0.2609150499556646, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 44318037.0, "reward": 0.681640625, "reward_std": 0.2350630760192871, "rewards/accuracy_reward/mean": 0.181640625, "rewards/accuracy_reward/std": 0.38592514395713806, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4330.0, "completions/max_terminated_length": 4330.0, "completions/mean_length": 731.0078125, "completions/mean_terminated_length": 731.0078125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.22670086722853305, "grad_norm": 0.24727089653306142, "learning_rate": 1e-06, "loss": 0.0035, "num_tokens": 44781513.0, "reward": 0.6123046875, "reward_std": 0.14934487640857697, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1518.0, "completions/max_terminated_length": 1518.0, "completions/mean_length": 725.41015625, "completions/mean_terminated_length": 725.41015625, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.22865518504946866, "grad_norm": 0.2556791325009635, "learning_rate": 1e-06, "loss": 0.0071, "num_tokens": 45231483.0, "reward": 0.6044921875, "reward_std": 0.15568572282791138, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2137.0, "completions/max_terminated_length": 2137.0, "completions/mean_length": 767.5703125, "completions/mean_terminated_length": 767.5703125, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.2306095028704043, "grad_norm": 0.2286798537485802, "learning_rate": 1e-06, "loss": 0.0047, "num_tokens": 45717935.0, "reward": 0.5673828125, "reward_std": 0.14105787873268127, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1265.0, "completions/max_terminated_length": 1265.0, "completions/mean_length": 707.685546875, "completions/mean_terminated_length": 707.685546875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.23256382069133993, "grad_norm": 0.23624585237603082, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 46166526.0, "reward": 0.607421875, "reward_std": 0.16086432337760925, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1570.0, "completions/max_terminated_length": 1570.0, "completions/mean_length": 638.669921875, "completions/mean_terminated_length": 638.669921875, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.23451813851227557, "grad_norm": 0.2660519743635799, "learning_rate": 1e-06, "loss": -0.0077, "num_tokens": 46572501.0, "reward": 0.5927734375, "reward_std": 0.13170786201953888, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3928.0, "completions/max_terminated_length": 3928.0, "completions/mean_length": 668.509765625, "completions/mean_terminated_length": 668.509765625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.23647245633321118, "grad_norm": 0.27160253111795785, "learning_rate": 1e-06, "loss": 0.0036, "num_tokens": 46990122.0, "reward": 0.5810546875, "reward_std": 0.10540895164012909, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2616.0, "completions/max_terminated_length": 2616.0, "completions/mean_length": 643.935546875, "completions/mean_terminated_length": 643.935546875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.23842677415414681, "grad_norm": 0.2782906958717248, "learning_rate": 1e-06, "loss": 0.003, "num_tokens": 47412393.0, "reward": 0.599609375, "reward_std": 0.15294982492923737, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1719.0, "completions/max_terminated_length": 1719.0, "completions/mean_length": 656.884765625, "completions/mean_terminated_length": 656.884765625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.24038109197508245, "grad_norm": 0.2860229830755967, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 47830686.0, "reward": 0.634765625, "reward_std": 0.18557101488113403, "rewards/accuracy_reward/mean": 0.134765625, "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1621.0, "completions/max_terminated_length": 1621.0, "completions/mean_length": 699.51953125, "completions/mean_terminated_length": 699.51953125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.2423354097960181, "grad_norm": 0.26407286026259613, "learning_rate": 1e-06, "loss": 0.0027, "num_tokens": 48263288.0, "reward": 0.626953125, "reward_std": 0.20566867291927338, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1784.0, "completions/max_terminated_length": 1784.0, "completions/mean_length": 692.984375, "completions/mean_terminated_length": 692.984375, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.2442897276169537, "grad_norm": 0.24579712120050667, "learning_rate": 1e-06, "loss": 0.0078, "num_tokens": 48693200.0, "reward": 0.576171875, "reward_std": 0.16433526575565338, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4677.0, "completions/max_terminated_length": 4677.0, "completions/mean_length": 724.22265625, "completions/mean_terminated_length": 724.22265625, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.24624404543788933, "grad_norm": 0.27750163663134275, "learning_rate": 1e-06, "loss": 0.0148, "num_tokens": 49137042.0, "reward": 0.591796875, "reward_std": 0.16855724155902863, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4488.0, "completions/max_terminated_length": 4488.0, "completions/mean_length": 687.873046875, "completions/mean_terminated_length": 687.873046875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.24819836325882497, "grad_norm": 0.326471684770648, "learning_rate": 1e-06, "loss": 0.007, "num_tokens": 49558257.0, "reward": 0.650390625, "reward_std": 0.2361946851015091, "rewards/accuracy_reward/mean": 0.15234375, "rewards/accuracy_reward/std": 0.35970520973205566, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2412.0, "completions/max_terminated_length": 2412.0, "completions/mean_length": 722.185546875, "completions/mean_terminated_length": 722.185546875, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.2501526810797606, "grad_norm": 0.2945388229120416, "learning_rate": 1e-06, "loss": -0.0015, "num_tokens": 49996592.0, "reward": 0.623046875, "reward_std": 0.1910872906446457, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4334.0, "completions/max_terminated_length": 4334.0, "completions/mean_length": 736.427734375, "completions/mean_terminated_length": 736.427734375, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.2521069989006962, "grad_norm": 0.27809340109025193, "learning_rate": 1e-06, "loss": 0.0175, "num_tokens": 50447643.0, "reward": 0.66796875, "reward_std": 0.20906388759613037, "rewards/accuracy_reward/mean": 0.169921875, "rewards/accuracy_reward/std": 0.3759314715862274, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4486.0, "completions/max_terminated_length": 4486.0, "completions/mean_length": 697.94140625, "completions/mean_terminated_length": 697.94140625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.25406131672163185, "grad_norm": 0.35023314790448384, "learning_rate": 1e-06, "loss": 0.0034, "num_tokens": 50880269.0, "reward": 0.6220703125, "reward_std": 0.23224018514156342, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2300.0, "completions/max_terminated_length": 2300.0, "completions/mean_length": 688.73828125, "completions/mean_terminated_length": 688.73828125, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.2560156345425675, "grad_norm": 0.2927254570333304, "learning_rate": 1e-06, "loss": -0.0034, "num_tokens": 51313479.0, "reward": 0.568359375, "reward_std": 0.1609431505203247, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4300.0, "completions/max_terminated_length": 4300.0, "completions/mean_length": 718.095703125, "completions/mean_terminated_length": 718.095703125, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.2579699523635031, "grad_norm": 0.48962046110997975, "learning_rate": 1e-06, "loss": 0.01, "num_tokens": 51744200.0, "reward": 0.6240234375, "reward_std": 0.23233550786972046, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.994140625, "rewards/soft_format_reward/std": 0.07639661431312561, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4426.0, "completions/max_terminated_length": 4426.0, "completions/mean_length": 681.51953125, "completions/mean_terminated_length": 681.51953125, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.25992427018443875, "grad_norm": 0.2766999722832949, "learning_rate": 1e-06, "loss": -0.0036, "num_tokens": 52156018.0, "reward": 0.6220703125, "reward_std": 0.22395655512809753, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2029.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 741.916015625, "completions/mean_terminated_length": 741.916015625, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.2618785880053744, "grad_norm": 0.43387079617293317, "learning_rate": 1e-06, "loss": 0.0064, "num_tokens": 52618487.0, "reward": 0.677734375, "reward_std": 0.23530298471450806, "rewards/accuracy_reward/mean": 0.177734375, "rewards/accuracy_reward/std": 0.3826628625392914, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4471.0, "completions/max_terminated_length": 4471.0, "completions/mean_length": 744.962890625, "completions/mean_terminated_length": 744.962890625, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.26383290582631, "grad_norm": 1.3507532705943417, "learning_rate": 1e-06, "loss": 0.0209, "num_tokens": 53062372.0, "reward": 0.5986328125, "reward_std": 0.2160569280385971, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2466.0, "completions/max_terminated_length": 2466.0, "completions/mean_length": 608.060546875, "completions/mean_terminated_length": 608.060546875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.2657872236472456, "grad_norm": 0.6982954043889902, "learning_rate": 1e-06, "loss": 0.0075, "num_tokens": 53440003.0, "reward": 0.681640625, "reward_std": 0.2599703073501587, "rewards/accuracy_reward/mean": 0.181640625, "rewards/accuracy_reward/std": 0.38592514395713806, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3369.0, "completions/max_terminated_length": 3369.0, "completions/mean_length": 623.02734375, "completions/mean_terminated_length": 623.02734375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.26774154146818124, "grad_norm": 0.40073052595287706, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 53821105.0, "reward": 0.7041015625, "reward_std": 0.29975640773773193, "rewards/accuracy_reward/mean": 0.205078125, "rewards/accuracy_reward/std": 0.4041535556316376, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1811.0, "completions/max_terminated_length": 1811.0, "completions/mean_length": 683.2265625, "completions/mean_terminated_length": 683.2265625, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.2696958592891169, "grad_norm": 0.3030102675188629, "learning_rate": 1e-06, "loss": 0.0031, "num_tokens": 54239397.0, "reward": 0.611328125, "reward_std": 0.20015983283519745, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1362.0, "completions/max_terminated_length": 1362.0, "completions/mean_length": 587.904296875, "completions/mean_terminated_length": 587.904296875, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.2716501771100525, "grad_norm": 0.3629849326417865, "learning_rate": 1e-06, "loss": -0.0025, "num_tokens": 54601220.0, "reward": 0.71484375, "reward_std": 0.25802797079086304, "rewards/accuracy_reward/mean": 0.21484375, "rewards/accuracy_reward/std": 0.4111155867576599, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1554.0, "completions/max_terminated_length": 1554.0, "completions/mean_length": 633.720703125, "completions/mean_terminated_length": 633.720703125, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.27360449493098815, "grad_norm": 0.34569759966181524, "learning_rate": 1e-06, "loss": -0.0014, "num_tokens": 54989189.0, "reward": 0.640625, "reward_std": 0.23244047164916992, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4287.0, "completions/max_terminated_length": 4287.0, "completions/mean_length": 739.5390625, "completions/mean_terminated_length": 739.5390625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.2755588127519238, "grad_norm": 0.28818740456046005, "learning_rate": 1e-06, "loss": 0.0135, "num_tokens": 55429081.0, "reward": 0.6943359375, "reward_std": 0.22556185722351074, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.3968288004398346, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3252.0, "completions/max_terminated_length": 3252.0, "completions/mean_length": 691.375, "completions/mean_terminated_length": 691.375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.2775131305728594, "grad_norm": 0.272315442375954, "learning_rate": 1e-06, "loss": -0.0083, "num_tokens": 55845545.0, "reward": 0.6103515625, "reward_std": 0.16950386762619019, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.990234375, "rewards/soft_format_reward/std": 0.09843364357948303, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3698.0, "completions/max_terminated_length": 3698.0, "completions/mean_length": 713.0546875, "completions/mean_terminated_length": 713.0546875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.27946744839379506, "grad_norm": 0.2632114845336422, "learning_rate": 1e-06, "loss": 0.0174, "num_tokens": 56284789.0, "reward": 0.6494140625, "reward_std": 0.17788583040237427, "rewards/accuracy_reward/mean": 0.15234375, "rewards/accuracy_reward/std": 0.35970520973205566, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.994140625, "rewards/soft_format_reward/std": 0.07639661431312561, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2070.0, "completions/max_terminated_length": 2070.0, "completions/mean_length": 679.162109375, "completions/mean_terminated_length": 679.162109375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.2814217662147307, "grad_norm": 0.29020462850988193, "learning_rate": 1e-06, "loss": -0.0039, "num_tokens": 56700440.0, "reward": 0.62890625, "reward_std": 0.11030054092407227, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2492.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 759.43359375, "completions/mean_terminated_length": 759.43359375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.2833760840356663, "grad_norm": 0.17974918127783202, "learning_rate": 1e-06, "loss": 0.0082, "num_tokens": 57159014.0, "reward": 0.56640625, "reward_std": 0.12247820198535919, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2610.0, "completions/max_terminated_length": 2610.0, "completions/mean_length": 692.119140625, "completions/mean_terminated_length": 692.119140625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.2853304018566019, "grad_norm": 0.2431924521653994, "learning_rate": 1e-06, "loss": -0.005, "num_tokens": 57580803.0, "reward": 0.572265625, "reward_std": 0.14612169563770294, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4290.0, "completions/max_terminated_length": 4290.0, "completions/mean_length": 670.27734375, "completions/mean_terminated_length": 670.27734375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.28728471967753755, "grad_norm": 0.3488174165532187, "learning_rate": 1e-06, "loss": 0.0022, "num_tokens": 57983521.0, "reward": 0.6708984375, "reward_std": 0.2069774866104126, "rewards/accuracy_reward/mean": 0.173828125, "rewards/accuracy_reward/std": 0.3793322443962097, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.994140625, "rewards/soft_format_reward/std": 0.07639661431312561, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2480.0, "completions/max_terminated_length": 2480.0, "completions/mean_length": 742.1328125, "completions/mean_terminated_length": 742.1328125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.2892390374984732, "grad_norm": 0.24031256742591312, "learning_rate": 1e-06, "loss": 0.0067, "num_tokens": 58428325.0, "reward": 0.638671875, "reward_std": 0.19146013259887695, "rewards/accuracy_reward/mean": 0.138671875, "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2790.0, "completions/max_terminated_length": 2790.0, "completions/mean_length": 710.646484375, "completions/mean_terminated_length": 710.646484375, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.2911933553194088, "grad_norm": 0.2589578742432614, "learning_rate": 1e-06, "loss": 0.0034, "num_tokens": 58863264.0, "reward": 0.689453125, "reward_std": 0.18030090630054474, "rewards/accuracy_reward/mean": 0.189453125, "rewards/accuracy_reward/std": 0.3922513723373413, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3338.0, "completions/max_terminated_length": 3338.0, "completions/mean_length": 689.119140625, "completions/mean_terminated_length": 689.119140625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.29314767314034446, "grad_norm": 0.31023940245693943, "learning_rate": 1e-06, "loss": 0.0231, "num_tokens": 59282013.0, "reward": 0.6826171875, "reward_std": 0.19207939505577087, "rewards/accuracy_reward/mean": 0.18359375, "rewards/accuracy_reward/std": 0.3875311613082886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4099.0, "completions/max_terminated_length": 4099.0, "completions/mean_length": 815.75, "completions/mean_terminated_length": 815.75, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.2951019909612801, "grad_norm": 0.24229591833709058, "learning_rate": 1e-06, "loss": 0.0128, "num_tokens": 59768989.0, "reward": 0.69921875, "reward_std": 0.2111576944589615, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.4027182459831238, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.9921875, "rewards/soft_format_reward/std": 0.08812850713729858, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4529.0, "completions/max_terminated_length": 4529.0, "completions/mean_length": 770.59765625, "completions/mean_terminated_length": 770.59765625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.29705630878221573, "grad_norm": 0.23041197635858718, "learning_rate": 1e-06, "loss": 0.0168, "num_tokens": 60246655.0, "reward": 0.669921875, "reward_std": 0.14793451130390167, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3776407241821289, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3552.0, "completions/max_terminated_length": 3552.0, "completions/mean_length": 802.4453125, "completions/mean_terminated_length": 802.4453125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.29901062660315136, "grad_norm": 0.23297876350421173, "learning_rate": 1e-06, "loss": 0.0186, "num_tokens": 60737779.0, "reward": 0.6806640625, "reward_std": 0.14777296781539917, "rewards/accuracy_reward/mean": 0.181640625, "rewards/accuracy_reward/std": 0.38592514395713806, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1844.0, "completions/max_terminated_length": 1844.0, "completions/mean_length": 758.3046875, "completions/mean_terminated_length": 758.3046875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.30096494442408694, "grad_norm": 0.26254209084339436, "learning_rate": 1e-06, "loss": -0.0081, "num_tokens": 61212575.0, "reward": 0.65234375, "reward_std": 0.17155036330223083, "rewards/accuracy_reward/mean": 0.15234375, "rewards/accuracy_reward/std": 0.35970520973205566, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1906.0, "completions/max_terminated_length": 1906.0, "completions/mean_length": 744.447265625, "completions/mean_terminated_length": 744.447265625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.3029192622450226, "grad_norm": 0.20358568099423952, "learning_rate": 1e-06, "loss": 0.0079, "num_tokens": 61662388.0, "reward": 0.56640625, "reward_std": 0.08109388500452042, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1819.0, "completions/max_terminated_length": 1819.0, "completions/mean_length": 811.734375, "completions/mean_terminated_length": 811.734375, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.3048735800659582, "grad_norm": 0.21664905219307753, "learning_rate": 1e-06, "loss": 0.0031, "num_tokens": 62164380.0, "reward": 0.6484375, "reward_std": 0.1361752450466156, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2043.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 683.7109375, "completions/mean_terminated_length": 683.7109375, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.30682789788689385, "grad_norm": 0.37650908881308803, "learning_rate": 1e-06, "loss": 0.0199, "num_tokens": 62618920.0, "reward": 0.767578125, "reward_std": 0.2620534896850586, "rewards/accuracy_reward/mean": 0.267578125, "rewards/accuracy_reward/std": 0.4431293308734894, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2904.0, "completions/max_terminated_length": 2904.0, "completions/mean_length": 752.37890625, "completions/mean_terminated_length": 752.37890625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.3087822157078295, "grad_norm": 0.24518760312034588, "learning_rate": 1e-06, "loss": 0.0146, "num_tokens": 63077690.0, "reward": 0.66796875, "reward_std": 0.15783661603927612, "rewards/accuracy_reward/mean": 0.16796875, "rewards/accuracy_reward/std": 0.374204158782959, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1666.0, "completions/max_terminated_length": 1666.0, "completions/mean_length": 748.009765625, "completions/mean_terminated_length": 748.009765625, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.3107365335287651, "grad_norm": 0.20605205942354404, "learning_rate": 1e-06, "loss": -0.0096, "num_tokens": 63527839.0, "reward": 0.62109375, "reward_std": 0.17718049883842468, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2589.0, "completions/max_terminated_length": 2589.0, "completions/mean_length": 747.109375, "completions/mean_terminated_length": 747.109375, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.31269085134970076, "grad_norm": 0.2679095070290706, "learning_rate": 1e-06, "loss": 0.0067, "num_tokens": 63975655.0, "reward": 0.6484375, "reward_std": 0.24407950043678284, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1827.0, "completions/max_terminated_length": 1827.0, "completions/mean_length": 812.529296875, "completions/mean_terminated_length": 812.529296875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.3146451691706364, "grad_norm": 0.24193586431650785, "learning_rate": 1e-06, "loss": -0.0078, "num_tokens": 64471622.0, "reward": 0.6953125, "reward_std": 0.22490081191062927, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.3968288004398346, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2423.0, "completions/max_terminated_length": 2423.0, "completions/mean_length": 854.40234375, "completions/mean_terminated_length": 854.40234375, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 0.316599486991572, "grad_norm": 0.2317553138327133, "learning_rate": 1e-06, "loss": -0.0058, "num_tokens": 64993460.0, "reward": 0.619140625, "reward_std": 0.16367799043655396, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1634.0, "completions/max_terminated_length": 1634.0, "completions/mean_length": 777.09375, "completions/mean_terminated_length": 778.614501953125, "completions/min_length": 0.0, "completions/min_terminated_length": 260.0, "epoch": 0.3185538048125076, "grad_norm": 0.24628709207337093, "learning_rate": 1e-06, "loss": -0.0019, "num_tokens": 65482452.0, "reward": 0.7392578125, "reward_std": 0.24229753017425537, "rewards/accuracy_reward/mean": 0.240234375, "rewards/accuracy_reward/std": 0.4276435375213623, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2015.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 813.26171875, "completions/mean_terminated_length": 813.26171875, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.32050812263344325, "grad_norm": 0.2619314014933818, "learning_rate": 1e-06, "loss": 0.0072, "num_tokens": 65968538.0, "reward": 0.658203125, "reward_std": 0.20367062091827393, "rewards/accuracy_reward/mean": 0.158203125, "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3080.0, "completions/max_terminated_length": 3080.0, "completions/mean_length": 774.814453125, "completions/mean_terminated_length": 774.814453125, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.3224624404543789, "grad_norm": 0.25309168218127176, "learning_rate": 1e-06, "loss": 0.0175, "num_tokens": 66451115.0, "reward": 0.6220703125, "reward_std": 0.16638079285621643, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4065.0, "completions/max_terminated_length": 4065.0, "completions/mean_length": 790.060546875, "completions/mean_terminated_length": 790.060546875, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.3244167582753145, "grad_norm": 0.2963651499207953, "learning_rate": 1e-06, "loss": -0.0118, "num_tokens": 66925770.0, "reward": 0.74609375, "reward_std": 0.30336713790893555, "rewards/accuracy_reward/mean": 0.248046875, "rewards/accuracy_reward/std": 0.4323015511035919, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2770.0, "completions/max_terminated_length": 2770.0, "completions/mean_length": 764.955078125, "completions/mean_terminated_length": 764.955078125, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.32637107609625016, "grad_norm": 0.3205316675698082, "learning_rate": 1e-06, "loss": -0.0049, "num_tokens": 67382307.0, "reward": 0.736328125, "reward_std": 0.35904234647750854, "rewards/accuracy_reward/mean": 0.23828125, "rewards/accuracy_reward/std": 0.42644867300987244, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3767.0, "completions/max_terminated_length": 3767.0, "completions/mean_length": 780.142578125, "completions/mean_terminated_length": 780.142578125, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.3283253939171858, "grad_norm": 0.31422935388243467, "learning_rate": 1e-06, "loss": 0.0152, "num_tokens": 67847356.0, "reward": 0.7294921875, "reward_std": 0.2578871250152588, "rewards/accuracy_reward/mean": 0.232421875, "rewards/accuracy_reward/std": 0.42278963327407837, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.994140625, "rewards/soft_format_reward/std": 0.07639661431312561, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3635.0, "completions/max_terminated_length": 3635.0, "completions/mean_length": 789.384765625, "completions/mean_terminated_length": 789.384765625, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.33027971173812143, "grad_norm": 0.2763591871083539, "learning_rate": 1e-06, "loss": 0.0166, "num_tokens": 68315777.0, "reward": 0.6640625, "reward_std": 0.2283334732055664, "rewards/accuracy_reward/mean": 0.166015625, "rewards/accuracy_reward/std": 0.3724585771560669, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3003.0, "completions/max_terminated_length": 3003.0, "completions/mean_length": 746.04296875, "completions/mean_terminated_length": 746.04296875, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.33223402955905706, "grad_norm": 0.22889079270457433, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 68784375.0, "reward": 0.6220703125, "reward_std": 0.15298479795455933, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2324.0, "completions/max_terminated_length": 2324.0, "completions/mean_length": 740.236328125, "completions/mean_terminated_length": 740.236328125, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.33418834737999265, "grad_norm": 0.2400226594585389, "learning_rate": 1e-06, "loss": 0.0032, "num_tokens": 69230560.0, "reward": 0.623046875, "reward_std": 0.13837136328220367, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2893.0, "completions/max_terminated_length": 2893.0, "completions/mean_length": 798.46484375, "completions/mean_terminated_length": 798.46484375, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.3361426652009283, "grad_norm": 0.21283379297888078, "learning_rate": 1e-06, "loss": 0.0069, "num_tokens": 69712606.0, "reward": 0.6533203125, "reward_std": 0.13678552210330963, "rewards/accuracy_reward/mean": 0.154296875, "rewards/accuracy_reward/std": 0.36158639192581177, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2525.0, "completions/max_terminated_length": 2525.0, "completions/mean_length": 781.80859375, "completions/mean_terminated_length": 781.80859375, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.3380969830218639, "grad_norm": 0.24719869366609729, "learning_rate": 1e-06, "loss": 0.0016, "num_tokens": 70180108.0, "reward": 0.650390625, "reward_std": 0.2241959273815155, "rewards/accuracy_reward/mean": 0.150390625, "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4074.0, "completions/max_terminated_length": 4074.0, "completions/mean_length": 748.6640625, "completions/mean_terminated_length": 748.6640625, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.34005130084279955, "grad_norm": 0.25610581415426187, "learning_rate": 1e-06, "loss": 0.0139, "num_tokens": 70632432.0, "reward": 0.634765625, "reward_std": 0.16960762441158295, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3302.0, "completions/max_terminated_length": 3302.0, "completions/mean_length": 745.935546875, "completions/mean_terminated_length": 745.935546875, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.3420056186637352, "grad_norm": 0.26869305689085277, "learning_rate": 1e-06, "loss": -0.0012, "num_tokens": 71082911.0, "reward": 0.646484375, "reward_std": 0.1600971221923828, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1497.0, "completions/max_terminated_length": 1497.0, "completions/mean_length": 757.15625, "completions/mean_terminated_length": 757.15625, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.3439599364846708, "grad_norm": 0.25912471509203233, "learning_rate": 1e-06, "loss": 0.0048, "num_tokens": 71547023.0, "reward": 0.685546875, "reward_std": 0.18943729996681213, "rewards/accuracy_reward/mean": 0.185546875, "rewards/accuracy_reward/std": 0.38912075757980347, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4423.0, "completions/max_terminated_length": 4423.0, "completions/mean_length": 845.38671875, "completions/mean_terminated_length": 845.38671875, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.34591425430560646, "grad_norm": 0.27027972145066304, "learning_rate": 1e-06, "loss": 0.0096, "num_tokens": 72061349.0, "reward": 0.6474609375, "reward_std": 0.21319273114204407, "rewards/accuracy_reward/mean": 0.150390625, "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.994140625, "rewards/soft_format_reward/std": 0.07639661431312561, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2395.0, "completions/max_terminated_length": 2395.0, "completions/mean_length": 712.73046875, "completions/mean_terminated_length": 712.73046875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.3478685721265421, "grad_norm": 0.3427869494899167, "learning_rate": 1e-06, "loss": -0.0083, "num_tokens": 72500203.0, "reward": 0.7109375, "reward_std": 0.2330913245677948, "rewards/accuracy_reward/mean": 0.2109375, "rewards/accuracy_reward/std": 0.4083731174468994, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4155.0, "completions/max_terminated_length": 4155.0, "completions/mean_length": 799.75, "completions/mean_terminated_length": 799.75, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 0.34982288994747773, "grad_norm": 0.25141299436628006, "learning_rate": 1e-06, "loss": 0.0092, "num_tokens": 72986843.0, "reward": 0.6064453125, "reward_std": 0.1697743833065033, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.990234375, "rewards/soft_format_reward/std": 0.09843364357948303, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3389.0, "completions/max_terminated_length": 3389.0, "completions/mean_length": 779.1171875, "completions/mean_terminated_length": 779.1171875, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.3517772077684133, "grad_norm": 0.29349116949044435, "learning_rate": 1e-06, "loss": -0.0031, "num_tokens": 73451527.0, "reward": 0.720703125, "reward_std": 0.26464205980300903, "rewards/accuracy_reward/mean": 0.220703125, "rewards/accuracy_reward/std": 0.4151262938976288, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3525.0, "completions/max_terminated_length": 3525.0, "completions/mean_length": 739.443359375, "completions/mean_terminated_length": 739.443359375, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.35373152558934895, "grad_norm": 0.3240617719772287, "learning_rate": 1e-06, "loss": 0.0163, "num_tokens": 73910714.0, "reward": 0.802734375, "reward_std": 0.31746405363082886, "rewards/accuracy_reward/mean": 0.3046875, "rewards/accuracy_reward/std": 0.4607250988483429, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3310.0, "completions/max_terminated_length": 3310.0, "completions/mean_length": 804.076171875, "completions/mean_terminated_length": 804.076171875, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.3556858434102846, "grad_norm": 0.23830613092555628, "learning_rate": 1e-06, "loss": 0.005, "num_tokens": 74379601.0, "reward": 0.623046875, "reward_std": 0.19262534379959106, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2351.0, "completions/max_terminated_length": 2351.0, "completions/mean_length": 851.955078125, "completions/mean_terminated_length": 851.955078125, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.3576401612312202, "grad_norm": 0.26190342676179346, "learning_rate": 1e-06, "loss": 0.0073, "num_tokens": 74893274.0, "reward": 0.75, "reward_std": 0.22990617156028748, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.43343618512153625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2327.0, "completions/max_terminated_length": 2327.0, "completions/mean_length": 862.33984375, "completions/mean_terminated_length": 862.33984375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.35959447905215586, "grad_norm": 0.2632879398895767, "learning_rate": 1e-06, "loss": 0.0137, "num_tokens": 75407912.0, "reward": 0.638671875, "reward_std": 0.17609556019306183, "rewards/accuracy_reward/mean": 0.138671875, "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2741.0, "completions/max_terminated_length": 2741.0, "completions/mean_length": 820.552734375, "completions/mean_terminated_length": 820.552734375, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.3615487968730915, "grad_norm": 0.2863485310277067, "learning_rate": 1e-06, "loss": 0.0058, "num_tokens": 75898611.0, "reward": 0.7080078125, "reward_std": 0.25882139801979065, "rewards/accuracy_reward/mean": 0.208984375, "rewards/accuracy_reward/std": 0.40698084235191345, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4223.0, "completions/max_terminated_length": 4223.0, "completions/mean_length": 814.423828125, "completions/mean_terminated_length": 814.423828125, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.36350311469402713, "grad_norm": 0.2622039048191849, "learning_rate": 1e-06, "loss": -0.0053, "num_tokens": 76386588.0, "reward": 0.5869140625, "reward_std": 0.18172740936279297, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1821.0, "completions/max_terminated_length": 1821.0, "completions/mean_length": 829.587890625, "completions/mean_terminated_length": 829.587890625, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.36545743251496277, "grad_norm": 0.2732455139111148, "learning_rate": 1e-06, "loss": 0.0022, "num_tokens": 76886505.0, "reward": 0.689453125, "reward_std": 0.23466821014881134, "rewards/accuracy_reward/mean": 0.189453125, "rewards/accuracy_reward/std": 0.3922513723373413, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4208.0, "completions/max_terminated_length": 4208.0, "completions/mean_length": 856.87109375, "completions/mean_terminated_length": 856.87109375, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.3674117503358984, "grad_norm": 0.2623055000011323, "learning_rate": 1e-06, "loss": 0.0129, "num_tokens": 77400583.0, "reward": 0.7314453125, "reward_std": 0.2852405905723572, "rewards/accuracy_reward/mean": 0.232421875, "rewards/accuracy_reward/std": 0.42278963327407837, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5640.0, "completions/max_terminated_length": 5640.0, "completions/mean_length": 853.30859375, "completions/mean_terminated_length": 853.30859375, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.369366068156834, "grad_norm": 0.3573629765618696, "learning_rate": 1e-06, "loss": 0.0153, "num_tokens": 77902117.0, "reward": 0.7265625, "reward_std": 0.2968667149543762, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.42402184009552, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.984375, "rewards/soft_format_reward/std": 0.12414088100194931, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3075.0, "completions/max_terminated_length": 3075.0, "completions/mean_length": 899.916015625, "completions/mean_terminated_length": 899.916015625, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.3713203859777696, "grad_norm": 0.19173275081974, "learning_rate": 1e-06, "loss": 0.0079, "num_tokens": 78430490.0, "reward": 0.685546875, "reward_std": 0.1747143268585205, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39069411158561707, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3782.0, "completions/max_terminated_length": 3782.0, "completions/mean_length": 955.080078125, "completions/mean_terminated_length": 955.080078125, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.37327470379870525, "grad_norm": 0.23793975238468773, "learning_rate": 1e-06, "loss": 0.0299, "num_tokens": 78984067.0, "reward": 0.6748046875, "reward_std": 0.2005995362997055, "rewards/accuracy_reward/mean": 0.177734375, "rewards/accuracy_reward/std": 0.3826628625392914, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.994140625, "rewards/soft_format_reward/std": 0.07639661431312561, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3498.0, "completions/max_terminated_length": 3498.0, "completions/mean_length": 866.150390625, "completions/mean_terminated_length": 866.150390625, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.3752290216196409, "grad_norm": 0.31379268532281196, "learning_rate": 1e-06, "loss": 0.0032, "num_tokens": 79499888.0, "reward": 0.724609375, "reward_std": 0.19679194688796997, "rewards/accuracy_reward/mean": 0.228515625, "rewards/accuracy_reward/std": 0.4202871024608612, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.9921875, "rewards/soft_format_reward/std": 0.08812850713729858, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2597.0, "completions/max_terminated_length": 2597.0, "completions/mean_length": 878.51953125, "completions/mean_terminated_length": 878.51953125, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.3771833394405765, "grad_norm": 0.24394454897717519, "learning_rate": 1e-06, "loss": 0.0042, "num_tokens": 80015866.0, "reward": 0.6806640625, "reward_std": 0.18255063891410828, "rewards/accuracy_reward/mean": 0.181640625, "rewards/accuracy_reward/std": 0.38592514395713806, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3530.0, "completions/max_terminated_length": 3530.0, "completions/mean_length": 909.580078125, "completions/mean_terminated_length": 909.580078125, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.37913765726151216, "grad_norm": 0.18040807657100863, "learning_rate": 1e-06, "loss": 0.0072, "num_tokens": 80542451.0, "reward": 0.626953125, "reward_std": 0.13550108671188354, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3397.0, "completions/max_terminated_length": 3397.0, "completions/mean_length": 954.884765625, "completions/mean_terminated_length": 954.884765625, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.3810919750824478, "grad_norm": 0.22170464927593608, "learning_rate": 1e-06, "loss": 0.0059, "num_tokens": 81102424.0, "reward": 0.65234375, "reward_std": 0.15196657180786133, "rewards/accuracy_reward/mean": 0.15234375, "rewards/accuracy_reward/std": 0.35970520973205566, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2168.0, "completions/max_terminated_length": 2168.0, "completions/mean_length": 908.2578125, "completions/mean_terminated_length": 908.2578125, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 0.38304629290338343, "grad_norm": 0.23600614941725273, "learning_rate": 1e-06, "loss": 0.0015, "num_tokens": 81646428.0, "reward": 0.68359375, "reward_std": 0.20152875781059265, "rewards/accuracy_reward/mean": 0.18359375, "rewards/accuracy_reward/std": 0.3875311613082886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4084.0, "completions/max_terminated_length": 4084.0, "completions/mean_length": 945.404296875, "completions/mean_terminated_length": 945.404296875, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.38500061072431907, "grad_norm": 0.21988431449537316, "learning_rate": 1e-06, "loss": 0.0215, "num_tokens": 82207227.0, "reward": 0.8193359375, "reward_std": 0.17495843768119812, "rewards/accuracy_reward/mean": 0.3203125, "rewards/accuracy_reward/std": 0.4670529365539551, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3764.0, "completions/max_terminated_length": 3764.0, "completions/mean_length": 931.064453125, "completions/mean_terminated_length": 931.064453125, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.38695492854525465, "grad_norm": 0.25995575663674936, "learning_rate": 1e-06, "loss": -0.007, "num_tokens": 82754828.0, "reward": 0.669921875, "reward_std": 0.21521224081516266, "rewards/accuracy_reward/mean": 0.169921875, "rewards/accuracy_reward/std": 0.3759314715862274, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1944.0, "completions/max_terminated_length": 1944.0, "completions/mean_length": 884.419921875, "completions/mean_terminated_length": 884.419921875, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.3889092463661903, "grad_norm": 0.26268468182790095, "learning_rate": 1e-06, "loss": 0.0074, "num_tokens": 83296547.0, "reward": 0.572265625, "reward_std": 0.14881780743598938, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2847.0, "completions/max_terminated_length": 2847.0, "completions/mean_length": 955.845703125, "completions/mean_terminated_length": 955.845703125, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.3908635641871259, "grad_norm": 0.2971001211592783, "learning_rate": 1e-06, "loss": 0.0054, "num_tokens": 83874180.0, "reward": 0.734375, "reward_std": 0.26208168268203735, "rewards/accuracy_reward/mean": 0.236328125, "rewards/accuracy_reward/std": 0.42524150013923645, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3982.0, "completions/max_terminated_length": 3982.0, "completions/mean_length": 915.42578125, "completions/mean_terminated_length": 915.42578125, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.39281788200806156, "grad_norm": 0.2612289532160619, "learning_rate": 1e-06, "loss": 0.0175, "num_tokens": 84414638.0, "reward": 0.71875, "reward_std": 0.22146812081336975, "rewards/accuracy_reward/mean": 0.220703125, "rewards/accuracy_reward/std": 0.4151262938976288, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2352.0, "completions/max_terminated_length": 2352.0, "completions/mean_length": 952.732421875, "completions/mean_terminated_length": 952.732421875, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 0.3947721998289972, "grad_norm": 0.21818012447386279, "learning_rate": 1e-06, "loss": -0.0053, "num_tokens": 84968773.0, "reward": 0.66015625, "reward_std": 0.16679157316684723, "rewards/accuracy_reward/mean": 0.16015625, "rewards/accuracy_reward/std": 0.3671095669269562, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1861.0, "completions/max_terminated_length": 1861.0, "completions/mean_length": 987.447265625, "completions/mean_terminated_length": 987.447265625, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 0.39672651764993283, "grad_norm": 0.2513367951611963, "learning_rate": 1e-06, "loss": -0.0109, "num_tokens": 85548490.0, "reward": 0.615234375, "reward_std": 0.1888715773820877, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2748.0, "completions/max_terminated_length": 2748.0, "completions/mean_length": 1065.8203125, "completions/mean_terminated_length": 1065.8203125, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 0.39868083547086847, "grad_norm": 0.2557963082324739, "learning_rate": 1e-06, "loss": 0.0058, "num_tokens": 86170270.0, "reward": 0.666015625, "reward_std": 0.23651830852031708, "rewards/accuracy_reward/mean": 0.166015625, "rewards/accuracy_reward/std": 0.3724585771560669, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4625.0, "completions/max_terminated_length": 4625.0, "completions/mean_length": 945.232421875, "completions/mean_terminated_length": 945.232421875, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.4006351532918041, "grad_norm": 0.30834471563398924, "learning_rate": 1e-06, "loss": 0.0106, "num_tokens": 86720965.0, "reward": 0.7470703125, "reward_std": 0.246811181306839, "rewards/accuracy_reward/mean": 0.248046875, "rewards/accuracy_reward/std": 0.4323015511035919, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2356.0, "completions/max_terminated_length": 2356.0, "completions/mean_length": 956.16796875, "completions/mean_terminated_length": 956.16796875, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 0.4025894711127397, "grad_norm": 0.27542187853912603, "learning_rate": 1e-06, "loss": 0.0014, "num_tokens": 87281467.0, "reward": 0.75, "reward_std": 0.29134687781333923, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.43343618512153625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4110.0, "completions/max_terminated_length": 4110.0, "completions/mean_length": 942.583984375, "completions/mean_terminated_length": 942.583984375, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 0.4045437889336753, "grad_norm": 0.30443486724056545, "learning_rate": 1e-06, "loss": 0.005, "num_tokens": 87835382.0, "reward": 0.8212890625, "reward_std": 0.3201446235179901, "rewards/accuracy_reward/mean": 0.322265625, "rewards/accuracy_reward/std": 0.46780112385749817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2104.0, "completions/max_terminated_length": 2104.0, "completions/mean_length": 827.787109375, "completions/mean_terminated_length": 827.787109375, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.40649810675461095, "grad_norm": 0.32313505262864856, "learning_rate": 1e-06, "loss": 0.0037, "num_tokens": 88322185.0, "reward": 0.810546875, "reward_std": 0.25759023427963257, "rewards/accuracy_reward/mean": 0.310546875, "rewards/accuracy_reward/std": 0.46317005157470703, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4301.0, "completions/max_terminated_length": 4301.0, "completions/mean_length": 1033.625, "completions/mean_terminated_length": 1033.625, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.4084524245755466, "grad_norm": 0.18325104023038696, "learning_rate": 1e-06, "loss": 0.0145, "num_tokens": 88916201.0, "reward": 0.734375, "reward_std": 0.14592814445495605, "rewards/accuracy_reward/mean": 0.236328125, "rewards/accuracy_reward/std": 0.42524150013923645, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3938.0, "completions/max_terminated_length": 3938.0, "completions/mean_length": 921.39453125, "completions/mean_terminated_length": 921.39453125, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.4104067423964822, "grad_norm": 0.28524825116918173, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 89451411.0, "reward": 0.7763671875, "reward_std": 0.2564077377319336, "rewards/accuracy_reward/mean": 0.27734375, "rewards/accuracy_reward/std": 0.4481254518032074, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4314.0, "completions/max_terminated_length": 4314.0, "completions/mean_length": 1045.392578125, "completions/mean_terminated_length": 1045.392578125, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 0.41236106021741786, "grad_norm": 0.19271881109079272, "learning_rate": 1e-06, "loss": 0.0211, "num_tokens": 90058780.0, "reward": 0.70703125, "reward_std": 0.1599920094013214, "rewards/accuracy_reward/mean": 0.208984375, "rewards/accuracy_reward/std": 0.40698084235191345, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4420.0, "completions/max_terminated_length": 4420.0, "completions/mean_length": 971.927734375, "completions/mean_terminated_length": 971.927734375, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.4143153780383535, "grad_norm": 0.2600337313478683, "learning_rate": 1e-06, "loss": 0.013, "num_tokens": 90616535.0, "reward": 0.6455078125, "reward_std": 0.21474801003932953, "rewards/accuracy_reward/mean": 0.146484375, "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4103.0, "completions/max_terminated_length": 4103.0, "completions/mean_length": 1011.65234375, "completions/mean_terminated_length": 1011.65234375, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 0.41626969585928913, "grad_norm": 0.21847161709190585, "learning_rate": 1e-06, "loss": -0.0045, "num_tokens": 91199557.0, "reward": 0.716796875, "reward_std": 0.2477385401725769, "rewards/accuracy_reward/mean": 0.216796875, "rewards/accuracy_reward/std": 0.4124660789966583, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3865.0, "completions/max_terminated_length": 3865.0, "completions/mean_length": 965.220703125, "completions/mean_terminated_length": 965.220703125, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 0.41822401368022477, "grad_norm": 0.22320366682605963, "learning_rate": 1e-06, "loss": 0.0254, "num_tokens": 91765190.0, "reward": 0.7041015625, "reward_std": 0.2076844871044159, "rewards/accuracy_reward/mean": 0.20703125, "rewards/accuracy_reward/std": 0.40557438135147095, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.994140625, "rewards/soft_format_reward/std": 0.07639661431312561, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4404.0, "completions/max_terminated_length": 4404.0, "completions/mean_length": 1008.40625, "completions/mean_terminated_length": 1010.379638671875, "completions/min_length": 0.0, "completions/min_terminated_length": 322.0, "epoch": 0.42017833150116035, "grad_norm": 0.28569975603493336, "learning_rate": 1e-06, "loss": 0.0177, "num_tokens": 92359750.0, "reward": 0.75390625, "reward_std": 0.30729252099990845, "rewards/accuracy_reward/mean": 0.259765625, "rewards/accuracy_reward/std": 0.4389347732067108, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.98828125, "rewards/soft_format_reward/std": 0.10772226005792618, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2690.0, "completions/max_terminated_length": 2690.0, "completions/mean_length": 914.80859375, "completions/mean_terminated_length": 914.80859375, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.422132649322096, "grad_norm": 0.23027800733494036, "learning_rate": 1e-06, "loss": 0.0075, "num_tokens": 92907172.0, "reward": 0.7529296875, "reward_std": 0.1622444987297058, "rewards/accuracy_reward/mean": 0.25390625, "rewards/accuracy_reward/std": 0.43567025661468506, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4533.0, "completions/max_terminated_length": 4533.0, "completions/mean_length": 942.66796875, "completions/mean_terminated_length": 942.66796875, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.4240869671430316, "grad_norm": 0.2574145708092522, "learning_rate": 1e-06, "loss": 0.0105, "num_tokens": 93459210.0, "reward": 0.7529296875, "reward_std": 0.22094133496284485, "rewards/accuracy_reward/mean": 0.25390625, "rewards/accuracy_reward/std": 0.43567025661468506, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2874.0, "completions/max_terminated_length": 2874.0, "completions/mean_length": 1006.265625, "completions/mean_terminated_length": 1006.265625, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.42604128496396726, "grad_norm": 0.17045379685947146, "learning_rate": 1e-06, "loss": -0.0045, "num_tokens": 94039442.0, "reward": 0.6181640625, "reward_std": 0.12891486287117004, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2209.0, "completions/max_terminated_length": 2209.0, "completions/mean_length": 970.173828125, "completions/mean_terminated_length": 970.173828125, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.4279956027849029, "grad_norm": 0.26850685850054384, "learning_rate": 1e-06, "loss": 0.0263, "num_tokens": 94612219.0, "reward": 0.78125, "reward_std": 0.3012031018733978, "rewards/accuracy_reward/mean": 0.28125, "rewards/accuracy_reward/std": 0.45004892349243164, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3581.0, "completions/max_terminated_length": 3581.0, "completions/mean_length": 1007.35546875, "completions/mean_terminated_length": 1007.35546875, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.42994992060583853, "grad_norm": 0.26864480682396996, "learning_rate": 1e-06, "loss": 0.009, "num_tokens": 95203873.0, "reward": 0.8642578125, "reward_std": 0.3184603750705719, "rewards/accuracy_reward/mean": 0.365234375, "rewards/accuracy_reward/std": 0.4819667339324951, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3821.0, "completions/max_terminated_length": 3821.0, "completions/mean_length": 1079.130859375, "completions/mean_terminated_length": 1079.130859375, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "epoch": 0.43190423842677417, "grad_norm": 0.16386604992074785, "learning_rate": 1e-06, "loss": 0.0198, "num_tokens": 95823764.0, "reward": 0.6923828125, "reward_std": 0.161734938621521, "rewards/accuracy_reward/mean": 0.193359375, "rewards/accuracy_reward/std": 0.39531853795051575, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2110.0, "completions/max_terminated_length": 2110.0, "completions/mean_length": 989.015625, "completions/mean_terminated_length": 989.015625, "completions/min_length": 516.0, "completions/min_terminated_length": 516.0, "epoch": 0.4338585562477098, "grad_norm": 0.20953964134020223, "learning_rate": 1e-06, "loss": -0.0068, "num_tokens": 96399212.0, "reward": 0.7734375, "reward_std": 0.21356725692749023, "rewards/accuracy_reward/mean": 0.2734375, "rewards/accuracy_reward/std": 0.4461594223976135, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3732.0, "completions/max_terminated_length": 3732.0, "completions/mean_length": 1096.54296875, "completions/mean_terminated_length": 1096.54296875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.43581287406864544, "grad_norm": 0.19019372144990224, "learning_rate": 1e-06, "loss": 0.0071, "num_tokens": 97034434.0, "reward": 0.6591796875, "reward_std": 0.1866757571697235, "rewards/accuracy_reward/mean": 0.16015625, "rewards/accuracy_reward/std": 0.3671095669269562, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2360.0, "completions/max_terminated_length": 2360.0, "completions/mean_length": 1116.783203125, "completions/mean_terminated_length": 1116.783203125, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "epoch": 0.437767191889581, "grad_norm": 0.18315269162731226, "learning_rate": 1e-06, "loss": 0.0012, "num_tokens": 97681667.0, "reward": 0.71484375, "reward_std": 0.2196236550807953, "rewards/accuracy_reward/mean": 0.21484375, "rewards/accuracy_reward/std": 0.4111155867576599, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1895.0, "completions/max_terminated_length": 1895.0, "completions/mean_length": 849.51171875, "completions/mean_terminated_length": 849.51171875, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.43972150971051666, "grad_norm": 0.24948571472846473, "learning_rate": 1e-06, "loss": 0.0054, "num_tokens": 98184201.0, "reward": 0.7734375, "reward_std": 0.24267366528511047, "rewards/accuracy_reward/mean": 0.2734375, "rewards/accuracy_reward/std": 0.4461594223976135, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1957.0, "completions/max_terminated_length": 1957.0, "completions/mean_length": 946.234375, "completions/mean_terminated_length": 946.234375, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.4416758275314523, "grad_norm": 0.24152675899148837, "learning_rate": 1e-06, "loss": -0.0047, "num_tokens": 98734961.0, "reward": 0.716796875, "reward_std": 0.26917481422424316, "rewards/accuracy_reward/mean": 0.216796875, "rewards/accuracy_reward/std": 0.4124660789966583, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2867.0, "completions/max_terminated_length": 2867.0, "completions/mean_length": 948.015625, "completions/mean_terminated_length": 948.015625, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.44363014535238793, "grad_norm": 0.22545582726917313, "learning_rate": 1e-06, "loss": -0.0017, "num_tokens": 99281865.0, "reward": 0.830078125, "reward_std": 0.3249583840370178, "rewards/accuracy_reward/mean": 0.330078125, "rewards/accuracy_reward/std": 0.47070086002349854, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4594.0, "completions/max_terminated_length": 4594.0, "completions/mean_length": 1066.33203125, "completions/mean_terminated_length": 1066.33203125, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.44558446317332356, "grad_norm": 0.2096555377981446, "learning_rate": 1e-06, "loss": 0.0201, "num_tokens": 99890819.0, "reward": 0.7158203125, "reward_std": 0.25165998935699463, "rewards/accuracy_reward/mean": 0.2265625, "rewards/accuracy_reward/std": 0.4190165400505066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.978515625, "rewards/soft_format_reward/std": 0.14513419568538666, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3889.0, "completions/max_terminated_length": 3889.0, "completions/mean_length": 1005.283203125, "completions/mean_terminated_length": 1005.283203125, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.4475387809942592, "grad_norm": 0.19251257786284004, "learning_rate": 1e-06, "loss": 0.0076, "num_tokens": 100468452.0, "reward": 0.720703125, "reward_std": 0.24990171194076538, "rewards/accuracy_reward/mean": 0.22265625, "rewards/accuracy_reward/std": 0.41643625497817993, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3115.0, "completions/max_terminated_length": 3115.0, "completions/mean_length": 939.892578125, "completions/mean_terminated_length": 939.892578125, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 0.44949309881519484, "grad_norm": 0.2359322377049206, "learning_rate": 1e-06, "loss": 0.0194, "num_tokens": 101013965.0, "reward": 0.7138671875, "reward_std": 0.22999022901058197, "rewards/accuracy_reward/mean": 0.21484375, "rewards/accuracy_reward/std": 0.4111155867576599, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4053.0, "completions/max_terminated_length": 4053.0, "completions/mean_length": 984.09375, "completions/mean_terminated_length": 984.09375, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.45144741663613047, "grad_norm": 0.2865513700141309, "learning_rate": 1e-06, "loss": -0.0019, "num_tokens": 101596893.0, "reward": 0.681640625, "reward_std": 0.162692129611969, "rewards/accuracy_reward/mean": 0.18359375, "rewards/accuracy_reward/std": 0.3875311613082886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3125.0, "completions/max_terminated_length": 3125.0, "completions/mean_length": 972.630859375, "completions/mean_terminated_length": 972.630859375, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 0.4534017344570661, "grad_norm": 0.1706451353142778, "learning_rate": 1e-06, "loss": 0.0134, "num_tokens": 102160656.0, "reward": 0.7021484375, "reward_std": 0.159878671169281, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.4027182459831238, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4431.0, "completions/max_terminated_length": 4431.0, "completions/mean_length": 1023.845703125, "completions/mean_terminated_length": 1023.845703125, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.4553560522780017, "grad_norm": 0.14864571037065305, "learning_rate": 1e-06, "loss": 0.0223, "num_tokens": 102750353.0, "reward": 0.60546875, "reward_std": 0.16349893808364868, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3224.0, "completions/max_terminated_length": 3224.0, "completions/mean_length": 942.154296875, "completions/mean_terminated_length": 942.154296875, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.4573103700989373, "grad_norm": 0.18473669788538932, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 103300384.0, "reward": 0.658203125, "reward_std": 0.17805281281471252, "rewards/accuracy_reward/mean": 0.158203125, "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3319.0, "completions/max_terminated_length": 3319.0, "completions/mean_length": 978.4921875, "completions/mean_terminated_length": 978.4921875, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.45926468791987296, "grad_norm": 0.19895438933076293, "learning_rate": 1e-06, "loss": -0.0011, "num_tokens": 103866876.0, "reward": 0.6337890625, "reward_std": 0.1871449053287506, "rewards/accuracy_reward/mean": 0.134765625, "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3574.0, "completions/max_terminated_length": 3574.0, "completions/mean_length": 1034.4453125, "completions/mean_terminated_length": 1034.4453125, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 0.4612190057408086, "grad_norm": 0.19854174163197164, "learning_rate": 1e-06, "loss": 0.0146, "num_tokens": 104482608.0, "reward": 0.7099609375, "reward_std": 0.22323226928710938, "rewards/accuracy_reward/mean": 0.2109375, "rewards/accuracy_reward/std": 0.4083731174468994, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2528.0, "completions/max_terminated_length": 2528.0, "completions/mean_length": 1098.18359375, "completions/mean_terminated_length": 1098.18359375, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "epoch": 0.46317332356174423, "grad_norm": 0.1821806828854234, "learning_rate": 1e-06, "loss": 0.0037, "num_tokens": 105137966.0, "reward": 0.7431640625, "reward_std": 0.26687362790107727, "rewards/accuracy_reward/mean": 0.244140625, "rewards/accuracy_reward/std": 0.42999663949012756, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3657.0, "completions/max_terminated_length": 3657.0, "completions/mean_length": 978.833984375, "completions/mean_terminated_length": 978.833984375, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.46512764138267987, "grad_norm": 0.20262259753735837, "learning_rate": 1e-06, "loss": 0.0025, "num_tokens": 105717225.0, "reward": 0.69921875, "reward_std": 0.1814504861831665, "rewards/accuracy_reward/mean": 0.19921875, "rewards/accuracy_reward/std": 0.39980348944664, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5425.0, "completions/max_terminated_length": 5425.0, "completions/mean_length": 1133.865234375, "completions/mean_terminated_length": 1133.865234375, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.4670819592036155, "grad_norm": 0.1699007476515997, "learning_rate": 1e-06, "loss": 0.0212, "num_tokens": 106362132.0, "reward": 0.701171875, "reward_std": 0.21961811184883118, "rewards/accuracy_reward/mean": 0.205078125, "rewards/accuracy_reward/std": 0.4041535556316376, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.9921875, "rewards/soft_format_reward/std": 0.08812850713729858, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4108.0, "completions/max_terminated_length": 4108.0, "completions/mean_length": 951.15625, "completions/mean_terminated_length": 953.017578125, "completions/min_length": 0.0, "completions/min_terminated_length": 344.0, "epoch": 0.46903627702455114, "grad_norm": 0.29621199324090425, "learning_rate": 1e-06, "loss": 0.0166, "num_tokens": 106923012.0, "reward": 0.79296875, "reward_std": 0.31659504771232605, "rewards/accuracy_reward/mean": 0.302734375, "rewards/accuracy_reward/std": 0.45989060401916504, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.98046875, "rewards/soft_format_reward/std": 0.1385180652141571, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5658.0, "completions/max_terminated_length": 5658.0, "completions/mean_length": 1088.650390625, "completions/mean_terminated_length": 1088.650390625, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 0.4709905948454867, "grad_norm": 0.25561270272714015, "learning_rate": 1e-06, "loss": 0.0275, "num_tokens": 107561265.0, "reward": 0.7373046875, "reward_std": 0.2741406559944153, "rewards/accuracy_reward/mean": 0.248046875, "rewards/accuracy_reward/std": 0.4323015511035919, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.978515625, "rewards/soft_format_reward/std": 0.14513419568538666, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3788.0, "completions/max_terminated_length": 3788.0, "completions/mean_length": 1010.59375, "completions/mean_terminated_length": 1014.5569458007812, "completions/min_length": 0.0, "completions/min_terminated_length": 297.0, "epoch": 0.47294491266642236, "grad_norm": 0.2485195300084967, "learning_rate": 1e-06, "loss": 0.0101, "num_tokens": 108146161.0, "reward": 0.7236328125, "reward_std": 0.23151344060897827, "rewards/accuracy_reward/mean": 0.2265625, "rewards/accuracy_reward/std": 0.4190165400505066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.994140625, "rewards/soft_format_reward/std": 0.07639661431312561, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4851.0, "completions/max_terminated_length": 4851.0, "completions/mean_length": 1111.775390625, "completions/mean_terminated_length": 1116.1353759765625, "completions/min_length": 0.0, "completions/min_terminated_length": 379.0, "epoch": 0.474899230487358, "grad_norm": 0.2021770539335826, "learning_rate": 1e-06, "loss": -0.014, "num_tokens": 108785758.0, "reward": 0.7080078125, "reward_std": 0.26883962750434875, "rewards/accuracy_reward/mean": 0.212890625, "rewards/accuracy_reward/std": 0.409751296043396, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.990234375, "rewards/soft_format_reward/std": 0.09843364357948303, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 7216.0, "completions/max_terminated_length": 7216.0, "completions/mean_length": 1114.83984375, "completions/mean_terminated_length": 1117.021484375, "completions/min_length": 0.0, "completions/min_terminated_length": 572.0, "epoch": 0.47685354830829363, "grad_norm": 0.25711707470165174, "learning_rate": 1e-06, "loss": 0.0115, "num_tokens": 109440076.0, "reward": 0.71484375, "reward_std": 0.2208394706249237, "rewards/accuracy_reward/mean": 0.216796875, "rewards/accuracy_reward/std": 0.4124660789966583, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 7844.0, "completions/max_terminated_length": 7844.0, "completions/mean_length": 1116.5625, "completions/mean_terminated_length": 1166.69384765625, "completions/min_length": 0.0, "completions/min_terminated_length": 314.0, "epoch": 0.47880786612922926, "grad_norm": 0.31192577891301276, "learning_rate": 1e-06, "loss": 0.0091, "num_tokens": 110091964.0, "reward": 0.6650390625, "reward_std": 0.2845654785633087, "rewards/accuracy_reward/mean": 0.173828125, "rewards/accuracy_reward/std": 0.3793322443962097, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.982421875, "rewards/soft_format_reward/std": 0.13154059648513794, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 7687.0, "completions/max_terminated_length": 7687.0, "completions/mean_length": 1083.162109375, "completions/mean_terminated_length": 1205.6064453125, "completions/min_length": 0.0, "completions/min_terminated_length": 451.0, "epoch": 0.4807621839501649, "grad_norm": 0.3150070132679766, "learning_rate": 1e-06, "loss": 0.0324, "num_tokens": 110719583.0, "reward": 0.724609375, "reward_std": 0.2723310887813568, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.42402184009552, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.98046875, "rewards/soft_format_reward/std": 0.1385180652141571, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 8160.0, "completions/max_terminated_length": 8160.0, "completions/mean_length": 1227.6328125, "completions/mean_terminated_length": 1454.9722900390625, "completions/min_length": 0.0, "completions/min_terminated_length": 498.0, "epoch": 0.48271650177110054, "grad_norm": 0.24007897809165102, "learning_rate": 1e-06, "loss": 0.08, "num_tokens": 111427715.0, "reward": 0.7412109375, "reward_std": 0.3292117714881897, "rewards/accuracy_reward/mean": 0.2578125, "rewards/accuracy_reward/std": 0.43785804510116577, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.966796875, "rewards/soft_format_reward/std": 0.17934183776378632, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.228515625, "completions/max_length": 7801.0, "completions/max_terminated_length": 7801.0, "completions/mean_length": 1047.724609375, "completions/mean_terminated_length": 1358.0633544921875, "completions/min_length": 0.0, "completions/min_terminated_length": 494.0, "epoch": 0.4846708195920362, "grad_norm": 0.3638344282061002, "learning_rate": 1e-06, "loss": 0.0748, "num_tokens": 112034038.0, "reward": 0.69921875, "reward_std": 0.3243100941181183, "rewards/accuracy_reward/mean": 0.224609375, "rewards/accuracy_reward/std": 0.41773295402526855, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.94921875, "rewards/soft_format_reward/std": 0.21976542472839355, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.173828125, "completions/max_length": 7779.0, "completions/max_terminated_length": 7779.0, "completions/mean_length": 1090.521484375, "completions/mean_terminated_length": 1319.96923828125, "completions/min_length": 0.0, "completions/min_terminated_length": 343.0, "epoch": 0.4866251374129718, "grad_norm": 0.31731663593912496, "learning_rate": 1e-06, "loss": 0.0131, "num_tokens": 112654273.0, "reward": 0.818359375, "reward_std": 0.38540610671043396, "rewards/accuracy_reward/mean": 0.333984375, "rewards/accuracy_reward/std": 0.47209542989730835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.96875, "rewards/soft_format_reward/std": 0.17416280508041382, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 7232.0, "completions/max_terminated_length": 7232.0, "completions/mean_length": 1152.212890625, "completions/mean_terminated_length": 1340.7568359375, "completions/min_length": 0.0, "completions/min_terminated_length": 431.0, "epoch": 0.4885794552339074, "grad_norm": 0.39777837832174695, "learning_rate": 1e-06, "loss": 0.0329, "num_tokens": 113309134.0, "reward": 0.66796875, "reward_std": 0.26079481840133667, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39069411158561707, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.9609375, "rewards/soft_format_reward/std": 0.1939331740140915, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.142578125, "completions/max_length": 8176.0, "completions/max_terminated_length": 8176.0, "completions/mean_length": 1032.98046875, "completions/mean_terminated_length": 1204.751708984375, "completions/min_length": 0.0, "completions/min_terminated_length": 427.0, "epoch": 0.490533773054843, "grad_norm": 0.26392963715768764, "learning_rate": 1e-06, "loss": 0.0337, "num_tokens": 113908852.0, "reward": 0.681640625, "reward_std": 0.2452651858329773, "rewards/accuracy_reward/mean": 0.193359375, "rewards/accuracy_reward/std": 0.39531853795051575, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.9765625, "rewards/soft_format_reward/std": 0.15143637359142303, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.076171875, "completions/max_length": 7773.0, "completions/max_terminated_length": 7773.0, "completions/mean_length": 1102.197265625, "completions/mean_terminated_length": 1193.0760498046875, "completions/min_length": 0.0, "completions/min_terminated_length": 381.0, "epoch": 0.49248809087577866, "grad_norm": 0.27529383925423156, "learning_rate": 1e-06, "loss": 0.0246, "num_tokens": 114539913.0, "reward": 0.751953125, "reward_std": 0.31733620166778564, "rewards/accuracy_reward/mean": 0.263671875, "rewards/accuracy_reward/std": 0.4410543739795685, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.9765625, "rewards/soft_format_reward/std": 0.15143637359142303, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.048828125, "completions/max_length": 7395.0, "completions/max_terminated_length": 7395.0, "completions/mean_length": 1006.037109375, "completions/mean_terminated_length": 1057.6817626953125, "completions/min_length": 0.0, "completions/min_terminated_length": 506.0, "epoch": 0.4944424086967143, "grad_norm": 0.2879368899238134, "learning_rate": 1e-06, "loss": 0.0509, "num_tokens": 115128316.0, "reward": 0.7939453125, "reward_std": 0.2664770185947418, "rewards/accuracy_reward/mean": 0.30078125, "rewards/accuracy_reward/std": 0.45904624462127686, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.986328125, "rewards/soft_format_reward/std": 0.1162383034825325, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 5063.0, "completions/max_terminated_length": 5063.0, "completions/mean_length": 962.44140625, "completions/mean_terminated_length": 1013.9299926757812, "completions/min_length": 0.0, "completions/min_terminated_length": 371.0, "epoch": 0.49639672651764993, "grad_norm": 0.24299119734822017, "learning_rate": 1e-06, "loss": 0.0092, "num_tokens": 115689150.0, "reward": 0.6796875, "reward_std": 0.18629369139671326, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39069411158561707, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.984375, "rewards/soft_format_reward/std": 0.12414088100194931, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 3467.0, "completions/max_terminated_length": 3467.0, "completions/mean_length": 1049.736328125, "completions/mean_terminated_length": 1064.287109375, "completions/min_length": 0.0, "completions/min_terminated_length": 460.0, "epoch": 0.49835104433858557, "grad_norm": 0.3477404893024017, "learning_rate": 1e-06, "loss": 0.0185, "num_tokens": 116292535.0, "reward": 0.6904296875, "reward_std": 0.2398054003715515, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.4027182459831238, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.974609375, "rewards/soft_format_reward/std": 0.15746226906776428, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 3978.0, "completions/max_terminated_length": 3978.0, "completions/mean_length": 1005.923828125, "completions/mean_terminated_length": 1007.892333984375, "completions/min_length": 0.0, "completions/min_terminated_length": 411.0, "epoch": 0.5003053621595211, "grad_norm": 0.2156115651026281, "learning_rate": 1e-06, "loss": 0.0148, "num_tokens": 116868304.0, "reward": 0.66796875, "reward_std": 0.23239204287528992, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3776407241821289, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.9921875, "rewards/soft_format_reward/std": 0.08812850713729858, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4048.0, "completions/max_terminated_length": 4048.0, "completions/mean_length": 1019.859375, "completions/mean_terminated_length": 1021.8551635742188, "completions/min_length": 0.0, "completions/min_terminated_length": 414.0, "epoch": 0.5022596799804568, "grad_norm": 0.9399209758441138, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 117449320.0, "reward": 0.6611328125, "reward_std": 0.22365951538085938, "rewards/accuracy_reward/mean": 0.166015625, "rewards/accuracy_reward/std": 0.3724585771560669, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.990234375, "rewards/soft_format_reward/std": 0.09843364357948303, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7381.0, "completions/max_terminated_length": 7381.0, "completions/mean_length": 969.4609375, "completions/mean_terminated_length": 969.4609375, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.5042139978013924, "grad_norm": 0.2244360966335621, "learning_rate": 1e-06, "loss": 0.0023, "num_tokens": 118001092.0, "reward": 0.76171875, "reward_std": 0.22594638168811798, "rewards/accuracy_reward/mean": 0.263671875, "rewards/accuracy_reward/std": 0.4410543739795685, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3167.0, "completions/max_terminated_length": 3167.0, "completions/mean_length": 986.40234375, "completions/mean_terminated_length": 986.40234375, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.5061683156223281, "grad_norm": 0.29015636180825405, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 118571442.0, "reward": 0.72265625, "reward_std": 0.23174801468849182, "rewards/accuracy_reward/mean": 0.22265625, "rewards/accuracy_reward/std": 0.41643625497817993, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2125.0, "completions/max_terminated_length": 2125.0, "completions/mean_length": 1028.5, "completions/mean_terminated_length": 1028.5, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "epoch": 0.5081226334432637, "grad_norm": 0.2176405700101328, "learning_rate": 1e-06, "loss": -0.0033, "num_tokens": 119180418.0, "reward": 0.720703125, "reward_std": 0.2283822000026703, "rewards/accuracy_reward/mean": 0.220703125, "rewards/accuracy_reward/std": 0.4151262938976288, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4423.0, "completions/max_terminated_length": 4423.0, "completions/mean_length": 997.69921875, "completions/mean_terminated_length": 997.69921875, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.5100769512641994, "grad_norm": 0.22842908849300012, "learning_rate": 1e-06, "loss": 0.0042, "num_tokens": 119768904.0, "reward": 0.83984375, "reward_std": 0.25301796197891235, "rewards/accuracy_reward/mean": 0.33984375, "rewards/accuracy_reward/std": 0.4741191864013672, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4113.0, "completions/max_terminated_length": 4113.0, "completions/mean_length": 998.474609375, "completions/mean_terminated_length": 998.474609375, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.512031269085135, "grad_norm": 0.1840915908201, "learning_rate": 1e-06, "loss": -0.0025, "num_tokens": 120352107.0, "reward": 0.7314453125, "reward_std": 0.1765916347503662, "rewards/accuracy_reward/mean": 0.232421875, "rewards/accuracy_reward/std": 0.42278963327407837, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3735.0, "completions/max_terminated_length": 3735.0, "completions/mean_length": 985.5859375, "completions/mean_terminated_length": 985.5859375, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 0.5139855869060705, "grad_norm": 0.22731725715057885, "learning_rate": 1e-06, "loss": 0.0135, "num_tokens": 120921511.0, "reward": 0.734375, "reward_std": 0.27029913663864136, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.42402184009552, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3701.0, "completions/max_terminated_length": 3701.0, "completions/mean_length": 1013.005859375, "completions/mean_terminated_length": 1013.005859375, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.5159399047270062, "grad_norm": 0.3162499362581272, "learning_rate": 1e-06, "loss": 0.0079, "num_tokens": 121500890.0, "reward": 0.7724609375, "reward_std": 0.2833487391471863, "rewards/accuracy_reward/mean": 0.2734375, "rewards/accuracy_reward/std": 0.4461594223976135, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3627.0, "completions/max_terminated_length": 3627.0, "completions/mean_length": 920.548828125, "completions/mean_terminated_length": 920.548828125, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.5178942225479418, "grad_norm": 0.23792807621234424, "learning_rate": 1e-06, "loss": 0.0103, "num_tokens": 122041043.0, "reward": 0.916015625, "reward_std": 0.28482675552368164, "rewards/accuracy_reward/mean": 0.416015625, "rewards/accuracy_reward/std": 0.493378221988678, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4471.0, "completions/max_terminated_length": 4471.0, "completions/mean_length": 1087.7578125, "completions/mean_terminated_length": 1087.7578125, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "epoch": 0.5198485403688775, "grad_norm": 0.2552252902728524, "learning_rate": 1e-06, "loss": -0.0169, "num_tokens": 122679143.0, "reward": 0.8125, "reward_std": 0.21330136060714722, "rewards/accuracy_reward/mean": 0.3125, "rewards/accuracy_reward/std": 0.4639657139778137, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4663.0, "completions/max_terminated_length": 4663.0, "completions/mean_length": 994.794921875, "completions/mean_terminated_length": 996.74169921875, "completions/min_length": 0.0, "completions/min_terminated_length": 445.0, "epoch": 0.5218028581898131, "grad_norm": 0.29560652769427187, "learning_rate": 1e-06, "loss": -0.0021, "num_tokens": 123257486.0, "reward": 0.6318359375, "reward_std": 0.21056944131851196, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.990234375, "rewards/soft_format_reward/std": 0.09843364357948303, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4422.0, "completions/max_terminated_length": 4422.0, "completions/mean_length": 1029.32421875, "completions/mean_terminated_length": 1037.4290771484375, "completions/min_length": 0.0, "completions/min_terminated_length": 488.0, "epoch": 0.5237571760107488, "grad_norm": 0.20298588197620862, "learning_rate": 1e-06, "loss": 0.0043, "num_tokens": 123850324.0, "reward": 0.6572265625, "reward_std": 0.16812849044799805, "rewards/accuracy_reward/mean": 0.158203125, "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 7146.0, "completions/max_terminated_length": 7146.0, "completions/mean_length": 1039.353515625, "completions/mean_terminated_length": 1057.9501953125, "completions/min_length": 0.0, "completions/min_terminated_length": 407.0, "epoch": 0.5257114938316844, "grad_norm": 0.2642426401711122, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 124466681.0, "reward": 0.7431640625, "reward_std": 0.3039402961730957, "rewards/accuracy_reward/mean": 0.244140625, "rewards/accuracy_reward/std": 0.42999663949012756, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 8017.0, "completions/max_terminated_length": 8017.0, "completions/mean_length": 989.63671875, "completions/mean_terminated_length": 1005.3452758789062, "completions/min_length": 0.0, "completions/min_terminated_length": 365.0, "epoch": 0.52766581165262, "grad_norm": 0.321186877566316, "learning_rate": 1e-06, "loss": 0.0055, "num_tokens": 125047343.0, "reward": 0.7265625, "reward_std": 0.27988117933273315, "rewards/accuracy_reward/mean": 0.2265625, "rewards/accuracy_reward/std": 0.4190165400505066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 8071.0, "completions/max_terminated_length": 8071.0, "completions/mean_length": 980.1796875, "completions/mean_terminated_length": 989.84619140625, "completions/min_length": 0.0, "completions/min_terminated_length": 462.0, "epoch": 0.5296201294735556, "grad_norm": 0.25814862577428566, "learning_rate": 1e-06, "loss": 0.0095, "num_tokens": 125623019.0, "reward": 0.7734375, "reward_std": 0.2514013648033142, "rewards/accuracy_reward/mean": 0.2734375, "rewards/accuracy_reward/std": 0.4461594223976135, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 3870.0, "completions/max_terminated_length": 3870.0, "completions/mean_length": 899.365234375, "completions/mean_terminated_length": 901.125244140625, "completions/min_length": 0.0, "completions/min_terminated_length": 370.0, "epoch": 0.5315744472944912, "grad_norm": 0.30822629477420593, "learning_rate": 1e-06, "loss": -0.0009, "num_tokens": 126154806.0, "reward": 0.833984375, "reward_std": 0.24169795215129852, "rewards/accuracy_reward/mean": 0.333984375, "rewards/accuracy_reward/std": 0.47209542989730835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3584.0, "completions/mean_length": 885.96484375, "completions/mean_terminated_length": 887.6986083984375, "completions/min_length": 0.0, "completions/min_terminated_length": 354.0, "epoch": 0.5335287651154269, "grad_norm": 0.4844193958853699, "learning_rate": 1e-06, "loss": -0.0007, "num_tokens": 126681124.0, "reward": 0.8046875, "reward_std": 0.2762932777404785, "rewards/accuracy_reward/mean": 0.3046875, "rewards/accuracy_reward/std": 0.4607250988483429, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 2763.0, "completions/max_terminated_length": 2763.0, "completions/mean_length": 970.884765625, "completions/mean_terminated_length": 984.3425903320312, "completions/min_length": 0.0, "completions/min_terminated_length": 436.0, "epoch": 0.5354830829363625, "grad_norm": 0.4122744154532177, "learning_rate": 1e-06, "loss": 0.0022, "num_tokens": 127245113.0, "reward": 0.8466796875, "reward_std": 0.280442476272583, "rewards/accuracy_reward/mean": 0.34765625, "rewards/accuracy_reward/std": 0.47669193148612976, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3716.0, "completions/max_terminated_length": 3716.0, "completions/mean_length": 973.455078125, "completions/mean_terminated_length": 977.2725830078125, "completions/min_length": 0.0, "completions/min_terminated_length": 471.0, "epoch": 0.5374374007572982, "grad_norm": 0.4641599866232637, "learning_rate": 1e-06, "loss": 0.0078, "num_tokens": 127809442.0, "reward": 0.7626953125, "reward_std": 0.30328112840652466, "rewards/accuracy_reward/mean": 0.263671875, "rewards/accuracy_reward/std": 0.4410543739795685, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 4072.0, "completions/max_terminated_length": 4072.0, "completions/mean_length": 953.28515625, "completions/mean_terminated_length": 958.9037475585938, "completions/min_length": 0.0, "completions/min_terminated_length": 416.0, "epoch": 0.5393917185782338, "grad_norm": 0.4011218132397713, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 128359876.0, "reward": 0.837890625, "reward_std": 0.269453763961792, "rewards/accuracy_reward/mean": 0.337890625, "rewards/accuracy_reward/std": 0.4734536409378052, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1784.0, "completions/max_terminated_length": 1784.0, "completions/mean_length": 882.904296875, "completions/mean_terminated_length": 882.904296875, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.5413460363991695, "grad_norm": 0.5510919794170652, "learning_rate": 1e-06, "loss": 0.0025, "num_tokens": 128872547.0, "reward": 0.69921875, "reward_std": 0.24932363629341125, "rewards/accuracy_reward/mean": 0.19921875, "rewards/accuracy_reward/std": 0.39980348944664, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2081.0, "completions/max_terminated_length": 2081.0, "completions/mean_length": 983.107421875, "completions/mean_terminated_length": 983.107421875, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 0.543300354220105, "grad_norm": 0.694320230931412, "learning_rate": 1e-06, "loss": 0.0058, "num_tokens": 129446282.0, "reward": 0.74609375, "reward_std": 0.2507278323173523, "rewards/accuracy_reward/mean": 0.24609375, "rewards/accuracy_reward/std": 0.4311550557613373, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1908.0, "completions/max_terminated_length": 1908.0, "completions/mean_length": 924.23828125, "completions/mean_terminated_length": 926.0469360351562, "completions/min_length": 0.0, "completions/min_terminated_length": 272.0, "epoch": 0.5452546720410407, "grad_norm": 0.28831960857551026, "learning_rate": 1e-06, "loss": -0.0036, "num_tokens": 129985492.0, "reward": 0.7939453125, "reward_std": 0.24243290722370148, "rewards/accuracy_reward/mean": 0.294921875, "rewards/accuracy_reward/std": 0.4564536213874817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2227.0, "completions/max_terminated_length": 2227.0, "completions/mean_length": 973.123046875, "completions/mean_terminated_length": 975.0274047851562, "completions/min_length": 0.0, "completions/min_terminated_length": 365.0, "epoch": 0.5472089898619763, "grad_norm": 0.45862523305426195, "learning_rate": 1e-06, "loss": -0.0055, "num_tokens": 130545859.0, "reward": 0.80078125, "reward_std": 0.22870126366615295, "rewards/accuracy_reward/mean": 0.30078125, "rewards/accuracy_reward/std": 0.45904624462127686, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 3546.0, "completions/max_terminated_length": 3546.0, "completions/mean_length": 886.798828125, "completions/mean_terminated_length": 890.2765502929688, "completions/min_length": 0.0, "completions/min_terminated_length": 333.0, "epoch": 0.5491633076829119, "grad_norm": 0.27981683414820635, "learning_rate": 1e-06, "loss": -0.004, "num_tokens": 131065628.0, "reward": 0.716796875, "reward_std": 0.24010811746120453, "rewards/accuracy_reward/mean": 0.216796875, "rewards/accuracy_reward/std": 0.4124660789966583, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 3222.0, "completions/max_terminated_length": 3222.0, "completions/mean_length": 923.705078125, "completions/mean_terminated_length": 932.8146362304688, "completions/min_length": 0.0, "completions/min_terminated_length": 298.0, "epoch": 0.5511176255038476, "grad_norm": 0.9012737221127962, "learning_rate": 1e-06, "loss": -0.0009, "num_tokens": 131605637.0, "reward": 0.748046875, "reward_std": 0.19029605388641357, "rewards/accuracy_reward/mean": 0.248046875, "rewards/accuracy_reward/std": 0.4323015511035919, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4594.0, "completions/max_terminated_length": 4594.0, "completions/mean_length": 948.59765625, "completions/mean_terminated_length": 950.4539794921875, "completions/min_length": 0.0, "completions/min_terminated_length": 434.0, "epoch": 0.5530719433247832, "grad_norm": 0.5741973088658604, "learning_rate": 1e-06, "loss": 0.0092, "num_tokens": 132156151.0, "reward": 0.716796875, "reward_std": 0.2526024580001831, "rewards/accuracy_reward/mean": 0.216796875, "rewards/accuracy_reward/std": 0.4124660789966583, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1991.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 893.158203125, "completions/mean_terminated_length": 893.158203125, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.5550262611457188, "grad_norm": 0.3979017319882626, "learning_rate": 1e-06, "loss": -0.0045, "num_tokens": 132679368.0, "reward": 0.673828125, "reward_std": 0.20905625820159912, "rewards/accuracy_reward/mean": 0.173828125, "rewards/accuracy_reward/std": 0.3793322443962097, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1730.0, "completions/max_terminated_length": 1730.0, "completions/mean_length": 949.421875, "completions/mean_terminated_length": 951.2798461914062, "completions/min_length": 0.0, "completions/min_terminated_length": 310.0, "epoch": 0.5569805789666544, "grad_norm": 0.6711761247167988, "learning_rate": 1e-06, "loss": 0.0049, "num_tokens": 133230224.0, "reward": 0.7265625, "reward_std": 0.267217218875885, "rewards/accuracy_reward/mean": 0.2265625, "rewards/accuracy_reward/std": 0.4190165400505066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4416.0, "completions/max_terminated_length": 4416.0, "completions/mean_length": 976.94921875, "completions/mean_terminated_length": 976.94921875, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.5589348967875901, "grad_norm": 0.2792434772760303, "learning_rate": 1e-06, "loss": 0.0073, "num_tokens": 133793878.0, "reward": 0.802734375, "reward_std": 0.2886734902858734, "rewards/accuracy_reward/mean": 0.302734375, "rewards/accuracy_reward/std": 0.45989060401916504, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 7203.0, "completions/max_terminated_length": 7203.0, "completions/mean_length": 946.064453125, "completions/mean_terminated_length": 951.6405029296875, "completions/min_length": 0.0, "completions/min_terminated_length": 390.0, "epoch": 0.5608892146085257, "grad_norm": 1.9373572092283484, "learning_rate": 1e-06, "loss": -0.0066, "num_tokens": 134348103.0, "reward": 0.7109375, "reward_std": 0.2510862350463867, "rewards/accuracy_reward/mean": 0.2109375, "rewards/accuracy_reward/std": 0.4083731174468994, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2097.0, "completions/max_terminated_length": 2097.0, "completions/mean_length": 954.208984375, "completions/mean_terminated_length": 957.9510498046875, "completions/min_length": 0.0, "completions/min_terminated_length": 377.0, "epoch": 0.5628435324294614, "grad_norm": 24.67543527023826, "learning_rate": 1e-06, "loss": -0.0175, "num_tokens": 134902098.0, "reward": 0.68359375, "reward_std": 0.1904578059911728, "rewards/accuracy_reward/mean": 0.18359375, "rewards/accuracy_reward/std": 0.3875311613082886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 7087.0, "completions/max_terminated_length": 7087.0, "completions/mean_length": 915.7578125, "completions/mean_terminated_length": 919.3490600585938, "completions/min_length": 0.0, "completions/min_terminated_length": 406.0, "epoch": 0.564797850250397, "grad_norm": 4.570825105490816, "learning_rate": 1e-06, "loss": 0.0006, "num_tokens": 135432070.0, "reward": 0.712890625, "reward_std": 0.22883236408233643, "rewards/accuracy_reward/mean": 0.212890625, "rewards/accuracy_reward/std": 0.409751296043396, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1828.0, "completions/max_terminated_length": 1828.0, "completions/mean_length": 918.921875, "completions/mean_terminated_length": 918.921875, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 0.5667521680713326, "grad_norm": 0.6554731871867421, "learning_rate": 1e-06, "loss": 0.0029, "num_tokens": 135981006.0, "reward": 0.822265625, "reward_std": 0.24702796339988708, "rewards/accuracy_reward/mean": 0.322265625, "rewards/accuracy_reward/std": 0.46780112385749817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4170.0, "completions/max_terminated_length": 4170.0, "completions/mean_length": 1039.828125, "completions/mean_terminated_length": 1041.863037109375, "completions/min_length": 0.0, "completions/min_terminated_length": 478.0, "epoch": 0.5687064858922682, "grad_norm": 0.36442645531799234, "learning_rate": 1e-06, "loss": -0.0086, "num_tokens": 136583926.0, "reward": 0.7509765625, "reward_std": 0.2140166163444519, "rewards/accuracy_reward/mean": 0.251953125, "rewards/accuracy_reward/std": 0.43455907702445984, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4348.0, "completions/max_terminated_length": 4348.0, "completions/mean_length": 1105.818359375, "completions/mean_terminated_length": 1105.818359375, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "epoch": 0.5706608037132038, "grad_norm": 0.5143258161999796, "learning_rate": 1e-06, "loss": 0.0115, "num_tokens": 137219849.0, "reward": 0.6630859375, "reward_std": 0.2245439738035202, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.37069445848464966, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2486.0, "completions/max_terminated_length": 2486.0, "completions/mean_length": 930.1328125, "completions/mean_terminated_length": 930.1328125, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 0.5726151215341395, "grad_norm": 0.28065735282251886, "learning_rate": 1e-06, "loss": 0.0027, "num_tokens": 137788365.0, "reward": 0.916015625, "reward_std": 0.3028775751590729, "rewards/accuracy_reward/mean": 0.416015625, "rewards/accuracy_reward/std": 0.493378221988678, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3869.0, "completions/max_terminated_length": 3869.0, "completions/mean_length": 1040.60546875, "completions/mean_terminated_length": 1040.60546875, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.5745694393550751, "grad_norm": 0.2291918759972196, "learning_rate": 1e-06, "loss": 0.0189, "num_tokens": 138395043.0, "reward": 0.8076171875, "reward_std": 0.2563822567462921, "rewards/accuracy_reward/mean": 0.30859375, "rewards/accuracy_reward/std": 0.4623647928237915, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4291.0, "completions/max_terminated_length": 4291.0, "completions/mean_length": 996.271484375, "completions/mean_terminated_length": 996.271484375, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.5765237571760108, "grad_norm": 0.2593726473999473, "learning_rate": 1e-06, "loss": 0.0148, "num_tokens": 138982126.0, "reward": 0.873046875, "reward_std": 0.2986268997192383, "rewards/accuracy_reward/mean": 0.373046875, "rewards/accuracy_reward/std": 0.48408737778663635, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3692.0, "completions/max_terminated_length": 3692.0, "completions/mean_length": 922.37890625, "completions/mean_terminated_length": 922.37890625, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 0.5784780749969464, "grad_norm": 0.24902058148005377, "learning_rate": 1e-06, "loss": -0.0014, "num_tokens": 139519552.0, "reward": 0.78515625, "reward_std": 0.21863040328025818, "rewards/accuracy_reward/mean": 0.28515625, "rewards/accuracy_reward/std": 0.45193037390708923, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4822.0, "completions/max_terminated_length": 4822.0, "completions/mean_length": 1075.9609375, "completions/mean_terminated_length": 1075.9609375, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.5804323928178821, "grad_norm": 0.20053619451889154, "learning_rate": 1e-06, "loss": -0.0074, "num_tokens": 140135452.0, "reward": 0.6337890625, "reward_std": 0.21054817736148834, "rewards/accuracy_reward/mean": 0.134765625, "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3689.0, "completions/max_terminated_length": 3689.0, "completions/mean_length": 1006.880859375, "completions/mean_terminated_length": 1006.880859375, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.5823867106388176, "grad_norm": 0.2721278374748833, "learning_rate": 1e-06, "loss": 0.0105, "num_tokens": 140735999.0, "reward": 0.896484375, "reward_std": 0.30730587244033813, "rewards/accuracy_reward/mean": 0.396484375, "rewards/accuracy_reward/std": 0.4896455705165863, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2951.0, "completions/max_terminated_length": 2951.0, "completions/mean_length": 915.736328125, "completions/mean_terminated_length": 915.736328125, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 0.5843410284597532, "grad_norm": 0.2755522670107793, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 141269288.0, "reward": 0.974609375, "reward_std": 0.32747682929039, "rewards/accuracy_reward/mean": 0.474609375, "rewards/accuracy_reward/std": 0.4998432695865631, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3861.0, "completions/max_terminated_length": 3861.0, "completions/mean_length": 941.318359375, "completions/mean_terminated_length": 941.318359375, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.5862953462806889, "grad_norm": 0.2648539790797084, "learning_rate": 1e-06, "loss": 0.016, "num_tokens": 141842187.0, "reward": 0.9482421875, "reward_std": 0.3025415539741516, "rewards/accuracy_reward/mean": 0.44921875, "rewards/accuracy_reward/std": 0.497901052236557, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2542.0, "completions/max_terminated_length": 2542.0, "completions/mean_length": 895.84375, "completions/mean_terminated_length": 895.84375, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.5882496641016245, "grad_norm": 0.30717603139617383, "learning_rate": 1e-06, "loss": 0.0025, "num_tokens": 142376779.0, "reward": 0.955078125, "reward_std": 0.27775871753692627, "rewards/accuracy_reward/mean": 0.455078125, "rewards/accuracy_reward/std": 0.4984649419784546, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3184.0, "completions/max_terminated_length": 3184.0, "completions/mean_length": 921.248046875, "completions/mean_terminated_length": 921.248046875, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.5902039819225602, "grad_norm": 0.26303870393487144, "learning_rate": 1e-06, "loss": -0.0023, "num_tokens": 142914010.0, "reward": 0.927734375, "reward_std": 0.28719258308410645, "rewards/accuracy_reward/mean": 0.427734375, "rewards/accuracy_reward/std": 0.4952339828014374, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 6394.0, "completions/max_terminated_length": 6394.0, "completions/mean_length": 1127.466796875, "completions/mean_terminated_length": 1131.8883056640625, "completions/min_length": 0.0, "completions/min_terminated_length": 407.0, "epoch": 0.5921582997434958, "grad_norm": 0.1859365526140732, "learning_rate": 1e-06, "loss": -0.0011, "num_tokens": 143563609.0, "reward": 0.78125, "reward_std": 0.3581286370754242, "rewards/accuracy_reward/mean": 0.28125, "rewards/accuracy_reward/std": 0.45004892349243164, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4257.0, "completions/max_terminated_length": 4257.0, "completions/mean_length": 1075.068359375, "completions/mean_terminated_length": 1077.1722412109375, "completions/min_length": 0.0, "completions/min_terminated_length": 272.0, "epoch": 0.5941126175644315, "grad_norm": 0.20976677263274324, "learning_rate": 1e-06, "loss": -0.0075, "num_tokens": 144184732.0, "reward": 0.794921875, "reward_std": 0.2793903350830078, "rewards/accuracy_reward/mean": 0.294921875, "rewards/accuracy_reward/std": 0.4564536213874817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 4493.0, "completions/max_terminated_length": 4493.0, "completions/mean_length": 1056.142578125, "completions/mean_terminated_length": 1085.833251953125, "completions/min_length": 0.0, "completions/min_terminated_length": 505.0, "epoch": 0.596066935385367, "grad_norm": 0.2370549287713461, "learning_rate": 1e-06, "loss": -0.0022, "num_tokens": 144796245.0, "reward": 0.8154296875, "reward_std": 0.29888656735420227, "rewards/accuracy_reward/mean": 0.31640625, "rewards/accuracy_reward/std": 0.46552830934524536, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 8155.0, "completions/max_terminated_length": 8155.0, "completions/mean_length": 1055.77734375, "completions/mean_terminated_length": 1126.16259765625, "completions/min_length": 0.0, "completions/min_terminated_length": 395.0, "epoch": 0.5980212532063027, "grad_norm": 0.2677183667506309, "learning_rate": 1e-06, "loss": -0.0363, "num_tokens": 145422675.0, "reward": 0.763671875, "reward_std": 0.36193913221359253, "rewards/accuracy_reward/mean": 0.26953125, "rewards/accuracy_reward/std": 0.44415023922920227, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.98828125, "rewards/soft_format_reward/std": 0.10772226005792618, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4857.0, "completions/max_terminated_length": 4857.0, "completions/mean_length": 1103.701171875, "completions/mean_terminated_length": 1121.2203369140625, "completions/min_length": 0.0, "completions/min_terminated_length": 442.0, "epoch": 0.5999755710272383, "grad_norm": 0.20625630984182472, "learning_rate": 1e-06, "loss": -0.0057, "num_tokens": 146060442.0, "reward": 0.771484375, "reward_std": 0.2951411008834839, "rewards/accuracy_reward/mean": 0.271484375, "rewards/accuracy_reward/std": 0.44516023993492126, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 3827.0, "completions/max_terminated_length": 3827.0, "completions/mean_length": 1129.08203125, "completions/mean_terminated_length": 1149.2843017578125, "completions/min_length": 0.0, "completions/min_terminated_length": 475.0, "epoch": 0.6019298888481739, "grad_norm": 0.22505773820935762, "learning_rate": 1e-06, "loss": -0.0007, "num_tokens": 146717188.0, "reward": 0.814453125, "reward_std": 0.31528496742248535, "rewards/accuracy_reward/mean": 0.314453125, "rewards/accuracy_reward/std": 0.4647517800331116, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4405.0, "completions/max_terminated_length": 4405.0, "completions/mean_length": 1142.482421875, "completions/mean_terminated_length": 1151.4783935546875, "completions/min_length": 0.0, "completions/min_terminated_length": 539.0, "epoch": 0.6038842066691096, "grad_norm": 0.1971569822415046, "learning_rate": 1e-06, "loss": 0.0104, "num_tokens": 147377931.0, "reward": 0.859375, "reward_std": 0.3297439515590668, "rewards/accuracy_reward/mean": 0.359375, "rewards/accuracy_reward/std": 0.48028653860092163, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5132.0, "completions/max_terminated_length": 5132.0, "completions/mean_length": 1025.68359375, "completions/mean_terminated_length": 1025.68359375, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.6058385244900452, "grad_norm": 0.2330771803944694, "learning_rate": 1e-06, "loss": 0.0075, "num_tokens": 147970057.0, "reward": 0.884765625, "reward_std": 0.31399965286254883, "rewards/accuracy_reward/mean": 0.384765625, "rewards/accuracy_reward/std": 0.4870156943798065, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2709.0, "completions/max_terminated_length": 2709.0, "completions/mean_length": 965.142578125, "completions/mean_terminated_length": 967.0313110351562, "completions/min_length": 0.0, "completions/min_terminated_length": 276.0, "epoch": 0.6077928423109809, "grad_norm": 0.2278993739996118, "learning_rate": 1e-06, "loss": 0.0144, "num_tokens": 148527714.0, "reward": 0.8203125, "reward_std": 0.3155643939971924, "rewards/accuracy_reward/mean": 0.3203125, "rewards/accuracy_reward/std": 0.4670529365539551, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4086.0, "completions/max_terminated_length": 4086.0, "completions/mean_length": 1019.064453125, "completions/mean_terminated_length": 1019.064453125, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 0.6097471601319164, "grad_norm": 0.21361335909439352, "learning_rate": 1e-06, "loss": -0.0124, "num_tokens": 149118387.0, "reward": 0.841796875, "reward_std": 0.23987311124801636, "rewards/accuracy_reward/mean": 0.341796875, "rewards/accuracy_reward/std": 0.4747757613658905, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3634.0, "completions/max_terminated_length": 3634.0, "completions/mean_length": 1108.998046875, "completions/mean_terminated_length": 1108.998046875, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.6117014779528521, "grad_norm": 0.18715575552255137, "learning_rate": 1e-06, "loss": 0.0094, "num_tokens": 149756690.0, "reward": 0.7890625, "reward_std": 0.28230345249176025, "rewards/accuracy_reward/mean": 0.2890625, "rewards/accuracy_reward/std": 0.45377036929130554, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4580.0, "completions/max_terminated_length": 4580.0, "completions/mean_length": 1173.43359375, "completions/mean_terminated_length": 1173.43359375, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.6136557957737877, "grad_norm": 0.19816350569687385, "learning_rate": 1e-06, "loss": 0.0123, "num_tokens": 150421936.0, "reward": 0.81640625, "reward_std": 0.3708382844924927, "rewards/accuracy_reward/mean": 0.31640625, "rewards/accuracy_reward/std": 0.46552830934524536, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4249.0, "completions/max_terminated_length": 4249.0, "completions/mean_length": 1102.232421875, "completions/mean_terminated_length": 1102.232421875, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.6156101135947234, "grad_norm": 0.21601955360633918, "learning_rate": 1e-06, "loss": 0.0142, "num_tokens": 151050535.0, "reward": 0.91796875, "reward_std": 0.3318660855293274, "rewards/accuracy_reward/mean": 0.419921875, "rewards/accuracy_reward/std": 0.4940285086631775, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2819.0, "completions/max_terminated_length": 2819.0, "completions/mean_length": 1078.234375, "completions/mean_terminated_length": 1078.234375, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 0.617564431415659, "grad_norm": 0.2283945434889413, "learning_rate": 1e-06, "loss": 0.0029, "num_tokens": 151672575.0, "reward": 0.8828125, "reward_std": 0.34946271777153015, "rewards/accuracy_reward/mean": 0.3828125, "rewards/accuracy_reward/std": 0.486548513174057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5227.0, "completions/max_terminated_length": 5227.0, "completions/mean_length": 1081.626953125, "completions/mean_terminated_length": 1081.626953125, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.6195187492365946, "grad_norm": 0.20041741622788212, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 152287872.0, "reward": 0.84765625, "reward_std": 0.21362388134002686, "rewards/accuracy_reward/mean": 0.34765625, "rewards/accuracy_reward/std": 0.47669193148612976, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4295.0, "completions/max_terminated_length": 4295.0, "completions/mean_length": 1044.775390625, "completions/mean_terminated_length": 1044.775390625, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.6214730670575302, "grad_norm": 0.19355071134987356, "learning_rate": 1e-06, "loss": -0.0011, "num_tokens": 152890317.0, "reward": 0.8798828125, "reward_std": 0.26457107067108154, "rewards/accuracy_reward/mean": 0.380859375, "rewards/accuracy_reward/std": 0.48607301712036133, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3944.0, "completions/max_terminated_length": 3944.0, "completions/mean_length": 1091.482421875, "completions/mean_terminated_length": 1091.482421875, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 0.6234273848784658, "grad_norm": 0.2167859797293233, "learning_rate": 1e-06, "loss": -0.0026, "num_tokens": 153512580.0, "reward": 0.85546875, "reward_std": 0.3550078868865967, "rewards/accuracy_reward/mean": 0.35546875, "rewards/accuracy_reward/std": 0.47912323474884033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4463.0, "completions/max_terminated_length": 4463.0, "completions/mean_length": 1125.564453125, "completions/mean_terminated_length": 1125.564453125, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.6253817026994015, "grad_norm": 0.18167371714427352, "learning_rate": 1e-06, "loss": 0.0382, "num_tokens": 154150677.0, "reward": 0.98046875, "reward_std": 0.28993624448776245, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.5002445578575134, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.9921875, "rewards/soft_format_reward/std": 0.08812850713729858, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4365.0, "completions/max_terminated_length": 4365.0, "completions/mean_length": 1042.69921875, "completions/mean_terminated_length": 1044.73974609375, "completions/min_length": 0.0, "completions/min_terminated_length": 350.0, "epoch": 0.6273360205203371, "grad_norm": 0.21140532926422836, "learning_rate": 1e-06, "loss": 0.0209, "num_tokens": 154743723.0, "reward": 0.9814453125, "reward_std": 0.29769521951675415, "rewards/accuracy_reward/mean": 0.482421875, "rewards/accuracy_reward/std": 0.5001795887947083, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 3862.0, "completions/max_terminated_length": 3862.0, "completions/mean_length": 940.111328125, "completions/mean_terminated_length": 941.9510498046875, "completions/min_length": 0.0, "completions/min_terminated_length": 232.0, "epoch": 0.6292903383412728, "grad_norm": 0.20156774180148904, "learning_rate": 1e-06, "loss": 0.0096, "num_tokens": 155284372.0, "reward": 1.0185546875, "reward_std": 0.2644067108631134, "rewards/accuracy_reward/mean": 0.51953125, "rewards/accuracy_reward/std": 0.5001069903373718, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4154.0, "completions/max_terminated_length": 4154.0, "completions/mean_length": 951.70703125, "completions/mean_terminated_length": 951.70703125, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.6312446561622084, "grad_norm": 0.19357741577941043, "learning_rate": 1e-06, "loss": 0.0089, "num_tokens": 155826622.0, "reward": 1.0458984375, "reward_std": 0.27975112199783325, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.4982847273349762, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4951.0, "completions/max_terminated_length": 4951.0, "completions/mean_length": 977.861328125, "completions/mean_terminated_length": 977.861328125, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.633198973983144, "grad_norm": 0.25662343345447175, "learning_rate": 1e-06, "loss": 0.0051, "num_tokens": 156388663.0, "reward": 0.8388671875, "reward_std": 0.306870698928833, "rewards/accuracy_reward/mean": 0.33984375, "rewards/accuracy_reward/std": 0.4741191864013672, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4168.0, "completions/max_terminated_length": 4168.0, "completions/mean_length": 995.265625, "completions/mean_terminated_length": 997.2133178710938, "completions/min_length": 0.0, "completions/min_terminated_length": 345.0, "epoch": 0.6351532918040796, "grad_norm": 0.2527045594097066, "learning_rate": 1e-06, "loss": 0.012, "num_tokens": 156965023.0, "reward": 0.921875, "reward_std": 0.2805803716182709, "rewards/accuracy_reward/mean": 0.423828125, "rewards/accuracy_reward/std": 0.4946470856666565, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3506.0, "completions/max_terminated_length": 3506.0, "completions/mean_length": 932.640625, "completions/mean_terminated_length": 932.640625, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.6371076096250152, "grad_norm": 0.2431799639626169, "learning_rate": 1e-06, "loss": -0.0022, "num_tokens": 157506455.0, "reward": 0.896484375, "reward_std": 0.28797444701194763, "rewards/accuracy_reward/mean": 0.396484375, "rewards/accuracy_reward/std": 0.4896455705165863, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4566.0, "completions/max_terminated_length": 4566.0, "completions/mean_length": 1044.732421875, "completions/mean_terminated_length": 1044.732421875, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.6390619274459509, "grad_norm": 0.20157542110570098, "learning_rate": 1e-06, "loss": 0.022, "num_tokens": 158095806.0, "reward": 0.86328125, "reward_std": 0.2860869765281677, "rewards/accuracy_reward/mean": 0.36328125, "rewards/accuracy_reward/std": 0.4814152419567108, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2673.0, "completions/max_terminated_length": 2673.0, "completions/mean_length": 960.6015625, "completions/mean_terminated_length": 960.6015625, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.6410162452668865, "grad_norm": 0.20509448699954397, "learning_rate": 1e-06, "loss": 0.0079, "num_tokens": 158656114.0, "reward": 0.90625, "reward_std": 0.32110393047332764, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.49161264300346375, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1927.0, "completions/max_terminated_length": 1927.0, "completions/mean_length": 1042.87890625, "completions/mean_terminated_length": 1042.87890625, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.6429705630878222, "grad_norm": 0.16857134446456032, "learning_rate": 1e-06, "loss": 0.0062, "num_tokens": 159252868.0, "reward": 0.91015625, "reward_std": 0.27819085121154785, "rewards/accuracy_reward/mean": 0.41015625, "rewards/accuracy_reward/std": 0.49234291911125183, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4079.0, "completions/max_terminated_length": 4079.0, "completions/mean_length": 919.9296875, "completions/mean_terminated_length": 919.9296875, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 0.6449248809087578, "grad_norm": 0.22150563572303353, "learning_rate": 1e-06, "loss": 0.0112, "num_tokens": 159783200.0, "reward": 0.974609375, "reward_std": 0.32176676392555237, "rewards/accuracy_reward/mean": 0.474609375, "rewards/accuracy_reward/std": 0.4998432695865631, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8046.0, "completions/max_terminated_length": 8046.0, "completions/mean_length": 1018.91796875, "completions/mean_terminated_length": 1018.91796875, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.6468791987296935, "grad_norm": 0.20252483124411913, "learning_rate": 1e-06, "loss": 0.0138, "num_tokens": 160361974.0, "reward": 0.962890625, "reward_std": 0.2582211494445801, "rewards/accuracy_reward/mean": 0.462890625, "rewards/accuracy_reward/std": 0.4991086423397064, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3622.0, "completions/max_terminated_length": 3622.0, "completions/mean_length": 1080.962890625, "completions/mean_terminated_length": 1080.962890625, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 0.648833516550629, "grad_norm": 0.23036417412465934, "learning_rate": 1e-06, "loss": 0.0296, "num_tokens": 160975283.0, "reward": 0.986328125, "reward_std": 0.39269354939460754, "rewards/accuracy_reward/mean": 0.486328125, "rewards/accuracy_reward/std": 0.5003018379211426, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4387.0, "completions/max_terminated_length": 4387.0, "completions/mean_length": 1199.365234375, "completions/mean_terminated_length": 1199.365234375, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.6507878343715646, "grad_norm": 0.16341466775437016, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 161656366.0, "reward": 0.87109375, "reward_std": 0.28486648201942444, "rewards/accuracy_reward/mean": 0.37109375, "rewards/accuracy_reward/std": 0.4835699498653412, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4626.0, "completions/max_terminated_length": 4626.0, "completions/mean_length": 1042.716796875, "completions/mean_terminated_length": 1042.716796875, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.6527421521925003, "grad_norm": 0.17750826738799025, "learning_rate": 1e-06, "loss": 0.0212, "num_tokens": 162253037.0, "reward": 0.935546875, "reward_std": 0.2184910923242569, "rewards/accuracy_reward/mean": 0.435546875, "rewards/accuracy_reward/std": 0.49631330370903015, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4957.0, "completions/max_terminated_length": 4957.0, "completions/mean_length": 968.923828125, "completions/mean_terminated_length": 968.923828125, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.6546964700134359, "grad_norm": 0.1861420754343207, "learning_rate": 1e-06, "loss": -0.0013, "num_tokens": 162810614.0, "reward": 0.966796875, "reward_std": 0.25689640641212463, "rewards/accuracy_reward/mean": 0.466796875, "rewards/accuracy_reward/std": 0.4993842542171478, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3891.0, "completions/max_terminated_length": 3891.0, "completions/mean_length": 974.318359375, "completions/mean_terminated_length": 974.318359375, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.6566507878343716, "grad_norm": 0.2244008002693882, "learning_rate": 1e-06, "loss": 0.0228, "num_tokens": 163372809.0, "reward": 1.037109375, "reward_std": 0.3517191410064697, "rewards/accuracy_reward/mean": 0.537109375, "rewards/accuracy_reward/std": 0.4991086423397064, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3963.0, "completions/max_terminated_length": 3963.0, "completions/mean_length": 957.732421875, "completions/mean_terminated_length": 957.732421875, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 0.6586051056553072, "grad_norm": 0.20039358943002755, "learning_rate": 1e-06, "loss": 0.0042, "num_tokens": 163926080.0, "reward": 0.984375, "reward_std": 0.31068554520606995, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.5002445578575134, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5444.0, "completions/max_terminated_length": 5444.0, "completions/mean_length": 972.892578125, "completions/mean_terminated_length": 972.892578125, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 0.6605594234762429, "grad_norm": 0.23330319568338742, "learning_rate": 1e-06, "loss": 0.0158, "num_tokens": 164486569.0, "reward": 0.900390625, "reward_std": 0.3330186903476715, "rewards/accuracy_reward/mean": 0.400390625, "rewards/accuracy_reward/std": 0.4904567301273346, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2522.0, "completions/max_terminated_length": 2522.0, "completions/mean_length": 954.451171875, "completions/mean_terminated_length": 954.451171875, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.6625137412971784, "grad_norm": 0.1880611200124491, "learning_rate": 1e-06, "loss": 0.0161, "num_tokens": 165036656.0, "reward": 0.95703125, "reward_std": 0.2559651732444763, "rewards/accuracy_reward/mean": 0.45703125, "rewards/accuracy_reward/std": 0.49863746762275696, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5267.0, "completions/max_terminated_length": 5267.0, "completions/mean_length": 1018.328125, "completions/mean_terminated_length": 1018.328125, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.6644680591181141, "grad_norm": 0.19915271630621767, "learning_rate": 1e-06, "loss": 0.0062, "num_tokens": 165619032.0, "reward": 0.9423828125, "reward_std": 0.36097452044487, "rewards/accuracy_reward/mean": 0.443359375, "rewards/accuracy_reward/std": 0.49726733565330505, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4165.0, "completions/max_terminated_length": 4165.0, "completions/mean_length": 906.5546875, "completions/mean_terminated_length": 906.5546875, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.6664223769390497, "grad_norm": 0.20277825440489383, "learning_rate": 1e-06, "loss": 0.0225, "num_tokens": 166130756.0, "reward": 1.126953125, "reward_std": 0.281166672706604, "rewards/accuracy_reward/mean": 0.626953125, "rewards/accuracy_reward/std": 0.48408737778663635, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5169.0, "completions/max_terminated_length": 5169.0, "completions/mean_length": 998.9765625, "completions/mean_terminated_length": 998.9765625, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.6683766947599853, "grad_norm": 0.1702251710038088, "learning_rate": 1e-06, "loss": -0.0049, "num_tokens": 166696248.0, "reward": 0.955078125, "reward_std": 0.27338704466819763, "rewards/accuracy_reward/mean": 0.455078125, "rewards/accuracy_reward/std": 0.4984649419784546, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4518.0, "completions/max_terminated_length": 4518.0, "completions/mean_length": 1054.990234375, "completions/mean_terminated_length": 1054.990234375, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 0.670331012580921, "grad_norm": 0.2023491526993484, "learning_rate": 1e-06, "loss": 0.0261, "num_tokens": 167293539.0, "reward": 1.0185546875, "reward_std": 0.33125221729278564, "rewards/accuracy_reward/mean": 0.521484375, "rewards/accuracy_reward/std": 0.5000267624855042, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.994140625, "rewards/soft_format_reward/std": 0.07639661431312561, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2903.0, "completions/max_terminated_length": 2903.0, "completions/mean_length": 922.474609375, "completions/mean_terminated_length": 922.474609375, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.6722853304018566, "grad_norm": 0.21977580067642352, "learning_rate": 1e-06, "loss": -0.0037, "num_tokens": 167827126.0, "reward": 1.052734375, "reward_std": 0.3389337956905365, "rewards/accuracy_reward/mean": 0.552734375, "rewards/accuracy_reward/std": 0.4976975917816162, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3251.0, "completions/max_terminated_length": 3251.0, "completions/mean_length": 1021.240234375, "completions/mean_terminated_length": 1021.240234375, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.6742396482227923, "grad_norm": 0.18814944592818797, "learning_rate": 1e-06, "loss": -0.0102, "num_tokens": 168408193.0, "reward": 1.029296875, "reward_std": 0.30225419998168945, "rewards/accuracy_reward/mean": 0.529296875, "rewards/accuracy_reward/std": 0.49962911009788513, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4461.0, "completions/max_terminated_length": 4461.0, "completions/mean_length": 979.4453125, "completions/mean_terminated_length": 979.4453125, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.6761939660437278, "grad_norm": 0.20340291514882095, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 168965941.0, "reward": 0.9453125, "reward_std": 0.3172352910041809, "rewards/accuracy_reward/mean": 0.4453125, "rewards/accuracy_reward/std": 0.49748632311820984, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2999.0, "completions/max_terminated_length": 2999.0, "completions/mean_length": 956.697265625, "completions/mean_terminated_length": 956.697265625, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.6781482838646635, "grad_norm": 0.17474088253640468, "learning_rate": 1e-06, "loss": 0.0102, "num_tokens": 169512314.0, "reward": 0.8671875, "reward_std": 0.23352685570716858, "rewards/accuracy_reward/mean": 0.3671875, "rewards/accuracy_reward/std": 0.48250964283943176, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4268.0, "completions/max_terminated_length": 4268.0, "completions/mean_length": 1066.955078125, "completions/mean_terminated_length": 1066.955078125, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 0.6801026016855991, "grad_norm": 0.19888757329448092, "learning_rate": 1e-06, "loss": 0.0088, "num_tokens": 170127539.0, "reward": 0.888671875, "reward_std": 0.24496980011463165, "rewards/accuracy_reward/mean": 0.388671875, "rewards/accuracy_reward/std": 0.4879252314567566, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7847.0, "completions/max_terminated_length": 7847.0, "completions/mean_length": 999.2578125, "completions/mean_terminated_length": 999.2578125, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 0.6820569195065348, "grad_norm": 0.2395324660694693, "learning_rate": 1e-06, "loss": 0.0313, "num_tokens": 170701399.0, "reward": 1.0009765625, "reward_std": 0.32716602087020874, "rewards/accuracy_reward/mean": 0.501953125, "rewards/accuracy_reward/std": 0.5004851818084717, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2335.0, "completions/max_terminated_length": 2335.0, "completions/mean_length": 929.82421875, "completions/mean_terminated_length": 929.82421875, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.6840112373274704, "grad_norm": 0.22050010168287415, "learning_rate": 1e-06, "loss": 0.0223, "num_tokens": 171235933.0, "reward": 0.99609375, "reward_std": 0.32060712575912476, "rewards/accuracy_reward/mean": 0.49609375, "rewards/accuracy_reward/std": 0.5004737377166748, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5183.0, "completions/max_terminated_length": 5183.0, "completions/mean_length": 983.298828125, "completions/mean_terminated_length": 983.298828125, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.685965555148406, "grad_norm": 0.20864405778217568, "learning_rate": 1e-06, "loss": 0.0142, "num_tokens": 171802806.0, "reward": 0.8955078125, "reward_std": 0.26462340354919434, "rewards/accuracy_reward/mean": 0.396484375, "rewards/accuracy_reward/std": 0.4896455705165863, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2476.0, "completions/max_terminated_length": 2476.0, "completions/mean_length": 992.28515625, "completions/mean_terminated_length": 992.28515625, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.6879198729693417, "grad_norm": 0.1939572330378823, "learning_rate": 1e-06, "loss": 0.0231, "num_tokens": 172367336.0, "reward": 0.904296875, "reward_std": 0.27583566308021545, "rewards/accuracy_reward/mean": 0.404296875, "rewards/accuracy_reward/std": 0.4912354052066803, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5556.0, "completions/max_terminated_length": 5556.0, "completions/mean_length": 993.232421875, "completions/mean_terminated_length": 993.232421875, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.6898741907902772, "grad_norm": 0.21178878140983268, "learning_rate": 1e-06, "loss": 0.0138, "num_tokens": 172934575.0, "reward": 1.01953125, "reward_std": 0.252077579498291, "rewards/accuracy_reward/mean": 0.51953125, "rewards/accuracy_reward/std": 0.5001069903373718, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4506.0, "completions/max_terminated_length": 4506.0, "completions/mean_length": 1025.25, "completions/mean_terminated_length": 1027.25634765625, "completions/min_length": 0.0, "completions/min_terminated_length": 447.0, "epoch": 0.6918285086112129, "grad_norm": 0.2220756692952701, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 173516511.0, "reward": 0.849609375, "reward_std": 0.30644211173057556, "rewards/accuracy_reward/mean": 0.3515625, "rewards/accuracy_reward/std": 0.4779251217842102, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3937.0, "completions/max_terminated_length": 3937.0, "completions/mean_length": 973.056640625, "completions/mean_terminated_length": 973.056640625, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.6937828264321485, "grad_norm": 0.2723223044180947, "learning_rate": 1e-06, "loss": 0.0331, "num_tokens": 174083500.0, "reward": 0.9736328125, "reward_std": 0.3103483319282532, "rewards/accuracy_reward/mean": 0.474609375, "rewards/accuracy_reward/std": 0.4998432695865631, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6884.0, "completions/max_terminated_length": 6884.0, "completions/mean_length": 959.931640625, "completions/mean_terminated_length": 959.931640625, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.6957371442530842, "grad_norm": 0.2234927815528302, "learning_rate": 1e-06, "loss": 0.03, "num_tokens": 174633833.0, "reward": 0.978515625, "reward_std": 0.30642932653427124, "rewards/accuracy_reward/mean": 0.478515625, "rewards/accuracy_reward/std": 0.5000267624855042, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 6013.0, "completions/max_terminated_length": 6013.0, "completions/mean_length": 939.26171875, "completions/mean_terminated_length": 941.0997924804688, "completions/min_length": 0.0, "completions/min_terminated_length": 291.0, "epoch": 0.6976914620740198, "grad_norm": 0.2809864981267674, "learning_rate": 1e-06, "loss": 0.0049, "num_tokens": 175178799.0, "reward": 0.998046875, "reward_std": 0.3423428535461426, "rewards/accuracy_reward/mean": 0.498046875, "rewards/accuracy_reward/std": 0.5004851818084717, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4325.0, "completions/max_terminated_length": 4325.0, "completions/mean_length": 1063.2890625, "completions/mean_terminated_length": 1063.2890625, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.6996457798949555, "grad_norm": 0.19185919129350879, "learning_rate": 1e-06, "loss": 0.0187, "num_tokens": 175778979.0, "reward": 0.8388671875, "reward_std": 0.2665994167327881, "rewards/accuracy_reward/mean": 0.33984375, "rewards/accuracy_reward/std": 0.4741191864013672, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3015.0, "completions/max_terminated_length": 3015.0, "completions/mean_length": 966.79296875, "completions/mean_terminated_length": 966.79296875, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.701600097715891, "grad_norm": 0.21556774284686625, "learning_rate": 1e-06, "loss": -0.0061, "num_tokens": 176332777.0, "reward": 1.037109375, "reward_std": 0.2835124731063843, "rewards/accuracy_reward/mean": 0.5390625, "rewards/accuracy_reward/std": 0.4989593029022217, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 5211.0, "completions/max_terminated_length": 5211.0, "completions/mean_length": 1076.86328125, "completions/mean_terminated_length": 1078.9705810546875, "completions/min_length": 0.0, "completions/min_terminated_length": 422.0, "epoch": 0.7035544155368266, "grad_norm": 0.19706033942634024, "learning_rate": 1e-06, "loss": 0.0284, "num_tokens": 176949043.0, "reward": 0.96875, "reward_std": 0.31767719984054565, "rewards/accuracy_reward/mean": 0.47265625, "rewards/accuracy_reward/std": 0.49974003434181213, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.9921875, "rewards/soft_format_reward/std": 0.08812850713729858, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6451.0, "completions/max_terminated_length": 6451.0, "completions/mean_length": 1125.8359375, "completions/mean_terminated_length": 1125.8359375, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.7055087333577623, "grad_norm": 0.23532020192287942, "learning_rate": 1e-06, "loss": 0.0277, "num_tokens": 177586511.0, "reward": 0.931640625, "reward_std": 0.341478168964386, "rewards/accuracy_reward/mean": 0.435546875, "rewards/accuracy_reward/std": 0.49631330370903015, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.9921875, "rewards/soft_format_reward/std": 0.08812850713729858, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4267.0, "completions/max_terminated_length": 4267.0, "completions/mean_length": 1079.51171875, "completions/mean_terminated_length": 1079.51171875, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.7074630511786979, "grad_norm": 0.23254178529562144, "learning_rate": 1e-06, "loss": -0.0064, "num_tokens": 178195493.0, "reward": 1.0556640625, "reward_std": 0.40996870398521423, "rewards/accuracy_reward/mean": 0.55859375, "rewards/accuracy_reward/std": 0.4970405399799347, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.994140625, "rewards/soft_format_reward/std": 0.07639661431312561, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4531.0, "completions/max_terminated_length": 4531.0, "completions/mean_length": 1043.580078125, "completions/mean_terminated_length": 1043.580078125, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.7094173689996336, "grad_norm": 0.21972027991746884, "learning_rate": 1e-06, "loss": 0.0108, "num_tokens": 178797582.0, "reward": 0.794921875, "reward_std": 0.23968225717544556, "rewards/accuracy_reward/mean": 0.298828125, "rewards/accuracy_reward/std": 0.45819199085235596, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.9921875, "rewards/soft_format_reward/std": 0.08812850713729858, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4170.0, "completions/max_terminated_length": 4170.0, "completions/mean_length": 1076.59765625, "completions/mean_terminated_length": 1076.59765625, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.7113716868205692, "grad_norm": 0.22569527507457063, "learning_rate": 1e-06, "loss": 0.011, "num_tokens": 179408256.0, "reward": 0.85546875, "reward_std": 0.2965734899044037, "rewards/accuracy_reward/mean": 0.357421875, "rewards/accuracy_reward/std": 0.4797092080116272, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3941.0, "completions/max_terminated_length": 3941.0, "completions/mean_length": 979.7578125, "completions/mean_terminated_length": 979.7578125, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.7133260046415049, "grad_norm": 0.2065068796590369, "learning_rate": 1e-06, "loss": -0.0057, "num_tokens": 179976676.0, "reward": 0.8095703125, "reward_std": 0.2661935091018677, "rewards/accuracy_reward/mean": 0.310546875, "rewards/accuracy_reward/std": 0.46317005157470703, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3976.0, "completions/max_terminated_length": 3976.0, "completions/mean_length": 969.2734375, "completions/mean_terminated_length": 969.2734375, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.7152803224624404, "grad_norm": 0.20936776707413618, "learning_rate": 1e-06, "loss": 0.0187, "num_tokens": 180530416.0, "reward": 1.05078125, "reward_std": 0.29084792733192444, "rewards/accuracy_reward/mean": 0.55078125, "rewards/accuracy_reward/std": 0.497901052236557, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3612.0, "completions/max_terminated_length": 3612.0, "completions/mean_length": 1004.86328125, "completions/mean_terminated_length": 1004.86328125, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.7172346402833761, "grad_norm": 0.20583825538387396, "learning_rate": 1e-06, "loss": 0.0243, "num_tokens": 181105850.0, "reward": 0.9111328125, "reward_std": 0.2520977854728699, "rewards/accuracy_reward/mean": 0.412109375, "rewards/accuracy_reward/std": 0.49269601702690125, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4012.0, "completions/max_terminated_length": 4012.0, "completions/mean_length": 1023.056640625, "completions/mean_terminated_length": 1023.056640625, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 0.7191889581043117, "grad_norm": 0.21930718293078055, "learning_rate": 1e-06, "loss": 0.0153, "num_tokens": 181687975.0, "reward": 0.935546875, "reward_std": 0.28961291909217834, "rewards/accuracy_reward/mean": 0.435546875, "rewards/accuracy_reward/std": 0.49631330370903015, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3805.0, "completions/max_terminated_length": 3805.0, "completions/mean_length": 975.69921875, "completions/mean_terminated_length": 975.69921875, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 0.7211432759252473, "grad_norm": 0.23388149896309182, "learning_rate": 1e-06, "loss": -0.0053, "num_tokens": 182271981.0, "reward": 1.205078125, "reward_std": 0.2989095449447632, "rewards/accuracy_reward/mean": 0.705078125, "rewards/accuracy_reward/std": 0.4564536213874817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2938.0, "completions/max_terminated_length": 2938.0, "completions/mean_length": 865.365234375, "completions/mean_terminated_length": 865.365234375, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.723097593746183, "grad_norm": 0.25795839683205546, "learning_rate": 1e-06, "loss": 0.0125, "num_tokens": 182794040.0, "reward": 1.1806640625, "reward_std": 0.2649618089199066, "rewards/accuracy_reward/mean": 0.681640625, "rewards/accuracy_reward/std": 0.46629536151885986, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1626.0, "completions/max_terminated_length": 1626.0, "completions/mean_length": 874.736328125, "completions/mean_terminated_length": 874.736328125, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 0.7250519115671186, "grad_norm": 0.2619867502554444, "learning_rate": 1e-06, "loss": 0.0148, "num_tokens": 183338577.0, "reward": 1.16015625, "reward_std": 0.25967979431152344, "rewards/accuracy_reward/mean": 0.66015625, "rewards/accuracy_reward/std": 0.4741191864013672, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4121.0, "completions/max_terminated_length": 4121.0, "completions/mean_length": 1094.76953125, "completions/mean_terminated_length": 1094.76953125, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 0.7270062293880543, "grad_norm": 0.2576361395485926, "learning_rate": 1e-06, "loss": 0.0336, "num_tokens": 183970283.0, "reward": 1.0302734375, "reward_std": 0.3692060112953186, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.4995105266571045, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6671.0, "completions/max_terminated_length": 6671.0, "completions/mean_length": 1063.619140625, "completions/mean_terminated_length": 1063.619140625, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 0.7289605472089898, "grad_norm": 0.24802752750639284, "learning_rate": 1e-06, "loss": 0.0158, "num_tokens": 184582360.0, "reward": 1.0576171875, "reward_std": 0.32990533113479614, "rewards/accuracy_reward/mean": 0.55859375, "rewards/accuracy_reward/std": 0.4970405399799347, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4208.0, "completions/max_terminated_length": 4208.0, "completions/mean_length": 1165.00390625, "completions/mean_terminated_length": 1165.00390625, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 0.7309148650299255, "grad_norm": 0.2404454165210168, "learning_rate": 1e-06, "loss": 0.0074, "num_tokens": 185247162.0, "reward": 1.0302734375, "reward_std": 0.3166176676750183, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.4995105266571045, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4869.0, "completions/max_terminated_length": 4869.0, "completions/mean_length": 1084.7578125, "completions/mean_terminated_length": 1084.7578125, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 0.7328691828508611, "grad_norm": 0.24105245494477076, "learning_rate": 1e-06, "loss": 0.0162, "num_tokens": 185876350.0, "reward": 0.9599609375, "reward_std": 0.2650347948074341, "rewards/accuracy_reward/mean": 0.4609375, "rewards/accuracy_reward/std": 0.4989593029022217, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4852.0, "completions/max_terminated_length": 4852.0, "completions/mean_length": 1198.470703125, "completions/mean_terminated_length": 1198.470703125, "completions/min_length": 515.0, "completions/min_terminated_length": 515.0, "epoch": 0.7348235006717968, "grad_norm": 0.2459399628221602, "learning_rate": 1e-06, "loss": 0.0127, "num_tokens": 186561839.0, "reward": 0.86328125, "reward_std": 0.3282925486564636, "rewards/accuracy_reward/mean": 0.36328125, "rewards/accuracy_reward/std": 0.4814152419567108, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5050.0, "completions/max_terminated_length": 5050.0, "completions/mean_length": 1275.294921875, "completions/mean_terminated_length": 1275.294921875, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 0.7367778184927324, "grad_norm": 0.2176582509081832, "learning_rate": 1e-06, "loss": 0.0171, "num_tokens": 187293654.0, "reward": 0.908203125, "reward_std": 0.27721935510635376, "rewards/accuracy_reward/mean": 0.408203125, "rewards/accuracy_reward/std": 0.49198177456855774, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2405.0, "completions/max_terminated_length": 2405.0, "completions/mean_length": 1073.748046875, "completions/mean_terminated_length": 1073.748046875, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "epoch": 0.738732136313668, "grad_norm": 0.2120312763275498, "learning_rate": 1e-06, "loss": 0.0079, "num_tokens": 187916085.0, "reward": 1.10546875, "reward_std": 0.26140832901000977, "rewards/accuracy_reward/mean": 0.60546875, "rewards/accuracy_reward/std": 0.4892277717590332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5750.0, "completions/max_terminated_length": 5750.0, "completions/mean_length": 949.21875, "completions/mean_terminated_length": 949.21875, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 0.7406864541346037, "grad_norm": 0.27885106998279063, "learning_rate": 1e-06, "loss": 0.0067, "num_tokens": 188482933.0, "reward": 1.1767578125, "reward_std": 0.27184367179870605, "rewards/accuracy_reward/mean": 0.677734375, "rewards/accuracy_reward/std": 0.46780112385749817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4050.0, "completions/max_terminated_length": 4050.0, "completions/mean_length": 1103.09765625, "completions/mean_terminated_length": 1103.09765625, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "epoch": 0.7426407719555392, "grad_norm": 0.22653791246160732, "learning_rate": 1e-06, "loss": 0.0079, "num_tokens": 189148743.0, "reward": 0.986328125, "reward_std": 0.2644691467285156, "rewards/accuracy_reward/mean": 0.486328125, "rewards/accuracy_reward/std": 0.5003018379211426, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2086.0, "completions/max_terminated_length": 2086.0, "completions/mean_length": 950.05859375, "completions/mean_terminated_length": 950.05859375, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "epoch": 0.7445950897764749, "grad_norm": 0.26884527182128715, "learning_rate": 1e-06, "loss": -0.0011, "num_tokens": 189723333.0, "reward": 0.966796875, "reward_std": 0.2909739017486572, "rewards/accuracy_reward/mean": 0.466796875, "rewards/accuracy_reward/std": 0.4993842542171478, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3257.0, "completions/max_terminated_length": 3257.0, "completions/mean_length": 1073.556640625, "completions/mean_terminated_length": 1073.556640625, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.7465494075974105, "grad_norm": 0.24243344727319555, "learning_rate": 1e-06, "loss": 0.0081, "num_tokens": 190352690.0, "reward": 1.126953125, "reward_std": 0.3259270489215851, "rewards/accuracy_reward/mean": 0.626953125, "rewards/accuracy_reward/std": 0.48408737778663635, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4828.0, "completions/max_terminated_length": 4828.0, "completions/mean_length": 973.80859375, "completions/mean_terminated_length": 973.80859375, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 0.7485037254183462, "grad_norm": 0.2818313890492895, "learning_rate": 1e-06, "loss": 0.0087, "num_tokens": 190934688.0, "reward": 1.1796875, "reward_std": 0.2989754378795624, "rewards/accuracy_reward/mean": 0.6796875, "rewards/accuracy_reward/std": 0.4670529365539551, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2797.0, "completions/max_terminated_length": 2797.0, "completions/mean_length": 922.3984375, "completions/mean_terminated_length": 922.3984375, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 0.7504580432392818, "grad_norm": 0.2664778743355963, "learning_rate": 1e-06, "loss": 0.013, "num_tokens": 191461164.0, "reward": 1.07421875, "reward_std": 0.3264962434768677, "rewards/accuracy_reward/mean": 0.57421875, "rewards/accuracy_reward/std": 0.4949444830417633, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4507.0, "completions/max_terminated_length": 4507.0, "completions/mean_length": 1047.314453125, "completions/mean_terminated_length": 1047.314453125, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.7524123610602175, "grad_norm": 0.22596544703523644, "learning_rate": 1e-06, "loss": 0.0193, "num_tokens": 192073085.0, "reward": 1.078125, "reward_std": 0.29244738817214966, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.49434176087379456, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4225.0, "completions/max_terminated_length": 4225.0, "completions/mean_length": 846.6328125, "completions/mean_terminated_length": 846.6328125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.754366678881153, "grad_norm": 0.2680276869113691, "learning_rate": 1e-06, "loss": 0.0095, "num_tokens": 192578433.0, "reward": 1.228515625, "reward_std": 0.2956398129463196, "rewards/accuracy_reward/mean": 0.728515625, "rewards/accuracy_reward/std": 0.44516023993492126, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2814.0, "completions/max_terminated_length": 2814.0, "completions/mean_length": 939.390625, "completions/mean_terminated_length": 939.390625, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.7563209967020886, "grad_norm": 0.25647382882660336, "learning_rate": 1e-06, "loss": -0.0063, "num_tokens": 193128841.0, "reward": 1.25, "reward_std": 0.26654744148254395, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.43343618512153625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2576.0, "completions/max_terminated_length": 2576.0, "completions/mean_length": 860.400390625, "completions/mean_terminated_length": 860.400390625, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.7582753145230243, "grad_norm": 0.3148442209674101, "learning_rate": 1e-06, "loss": 0.0093, "num_tokens": 193634582.0, "reward": 1.1591796875, "reward_std": 0.3296191096305847, "rewards/accuracy_reward/mean": 0.66015625, "rewards/accuracy_reward/std": 0.4741191864013672, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2325.0, "completions/max_terminated_length": 2325.0, "completions/mean_length": 948.8125, "completions/mean_terminated_length": 948.8125, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.7602296323439599, "grad_norm": 0.2574471346650531, "learning_rate": 1e-06, "loss": 0.0092, "num_tokens": 194198406.0, "reward": 1.169921875, "reward_std": 0.23618678748607635, "rewards/accuracy_reward/mean": 0.669921875, "rewards/accuracy_reward/std": 0.47070086002349854, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4683.0, "completions/max_terminated_length": 4683.0, "completions/mean_length": 1055.330078125, "completions/mean_terminated_length": 1055.330078125, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 0.7621839501648956, "grad_norm": 0.19428781730315361, "learning_rate": 1e-06, "loss": 0.0024, "num_tokens": 194795391.0, "reward": 1.1376953125, "reward_std": 0.21126717329025269, "rewards/accuracy_reward/mean": 0.638671875, "rewards/accuracy_reward/std": 0.48085519671440125, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4069.0, "completions/max_terminated_length": 4069.0, "completions/mean_length": 995.46484375, "completions/mean_terminated_length": 995.46484375, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.7641382679858312, "grad_norm": 0.2581499589268977, "learning_rate": 1e-06, "loss": 0.0187, "num_tokens": 195367341.0, "reward": 1.0654296875, "reward_std": 0.29932349920272827, "rewards/accuracy_reward/mean": 0.56640625, "rewards/accuracy_reward/std": 0.4960552453994751, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2046.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 862.248046875, "completions/mean_terminated_length": 862.248046875, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.7660925858067669, "grad_norm": 0.2818869309096585, "learning_rate": 1e-06, "loss": 0.0158, "num_tokens": 195868380.0, "reward": 1.19140625, "reward_std": 0.27888891100883484, "rewards/accuracy_reward/mean": 0.69140625, "rewards/accuracy_reward/std": 0.4623647928237915, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4574.0, "completions/max_terminated_length": 4574.0, "completions/mean_length": 916.607421875, "completions/mean_terminated_length": 916.607421875, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.7680469036277024, "grad_norm": 0.2642600120130633, "learning_rate": 1e-06, "loss": 0.0311, "num_tokens": 196392227.0, "reward": 1.1513671875, "reward_std": 0.2804816961288452, "rewards/accuracy_reward/mean": 0.65234375, "rewards/accuracy_reward/std": 0.47669193148612976, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 7538.0, "completions/max_terminated_length": 7538.0, "completions/mean_length": 1020.80859375, "completions/mean_terminated_length": 1022.8062744140625, "completions/min_length": 0.0, "completions/min_terminated_length": 401.0, "epoch": 0.7700012214486381, "grad_norm": 0.26642048952222164, "learning_rate": 1e-06, "loss": 0.0055, "num_tokens": 196981505.0, "reward": 1.0458984375, "reward_std": 0.2517250180244446, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.4982847273349762, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4545.0, "completions/max_terminated_length": 4545.0, "completions/mean_length": 1059.181640625, "completions/mean_terminated_length": 1059.181640625, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 0.7719555392695737, "grad_norm": 0.2531640916757571, "learning_rate": 1e-06, "loss": -0.0095, "num_tokens": 197587022.0, "reward": 0.939453125, "reward_std": 0.25433334708213806, "rewards/accuracy_reward/mean": 0.439453125, "rewards/accuracy_reward/std": 0.49680593609809875, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4299.0, "completions/max_terminated_length": 4299.0, "completions/mean_length": 1077.2265625, "completions/mean_terminated_length": 1077.2265625, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "epoch": 0.7739098570905093, "grad_norm": 0.27930445296157, "learning_rate": 1e-06, "loss": 0.0133, "num_tokens": 198201346.0, "reward": 1.0126953125, "reward_std": 0.28195977210998535, "rewards/accuracy_reward/mean": 0.513671875, "rewards/accuracy_reward/std": 0.5003018379211426, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4401.0, "completions/max_terminated_length": 4401.0, "completions/mean_length": 1085.13671875, "completions/mean_terminated_length": 1085.13671875, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 0.775864174911445, "grad_norm": 0.25743249589936545, "learning_rate": 1e-06, "loss": 0.0151, "num_tokens": 198812424.0, "reward": 1.041015625, "reward_std": 0.326663613319397, "rewards/accuracy_reward/mean": 0.541015625, "rewards/accuracy_reward/std": 0.49880221486091614, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6222.0, "completions/max_terminated_length": 6222.0, "completions/mean_length": 1120.265625, "completions/mean_terminated_length": 1120.265625, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.7778184927323806, "grad_norm": 0.2503161354384062, "learning_rate": 1e-06, "loss": 0.0191, "num_tokens": 199453456.0, "reward": 0.9072265625, "reward_std": 0.29364052414894104, "rewards/accuracy_reward/mean": 0.408203125, "rewards/accuracy_reward/std": 0.49198177456855774, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3056.0, "completions/max_terminated_length": 3056.0, "completions/mean_length": 971.03515625, "completions/mean_terminated_length": 971.03515625, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.7797728105533163, "grad_norm": 0.27548667048132863, "learning_rate": 1e-06, "loss": 0.0064, "num_tokens": 200014434.0, "reward": 0.982421875, "reward_std": 0.18923774361610413, "rewards/accuracy_reward/mean": 0.482421875, "rewards/accuracy_reward/std": 0.5001795887947083, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3749.0, "completions/max_terminated_length": 3749.0, "completions/mean_length": 1073.80859375, "completions/mean_terminated_length": 1073.80859375, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 0.7817271283742518, "grad_norm": 0.3086954039100987, "learning_rate": 1e-06, "loss": 0.0105, "num_tokens": 200624272.0, "reward": 1.0283203125, "reward_std": 0.3033173680305481, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.4995105266571045, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.994140625, "rewards/soft_format_reward/std": 0.07639661431312561, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4564.0, "completions/max_terminated_length": 4564.0, "completions/mean_length": 1044.466796875, "completions/mean_terminated_length": 1044.466796875, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 0.7836814461951875, "grad_norm": 0.2684355677148113, "learning_rate": 1e-06, "loss": 0.0036, "num_tokens": 201222863.0, "reward": 0.962890625, "reward_std": 0.28903210163116455, "rewards/accuracy_reward/mean": 0.462890625, "rewards/accuracy_reward/std": 0.4991086423397064, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2505.0, "completions/max_terminated_length": 2505.0, "completions/mean_length": 942.828125, "completions/mean_terminated_length": 942.828125, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.7856357640161231, "grad_norm": 0.28069683172528026, "learning_rate": 1e-06, "loss": 0.0048, "num_tokens": 201762471.0, "reward": 1.109375, "reward_std": 0.2700616121292114, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.48836761713027954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5010.0, "completions/max_terminated_length": 5010.0, "completions/mean_length": 728.0390625, "completions/mean_terminated_length": 728.0390625, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.7875900818370587, "grad_norm": 0.3505792077876118, "learning_rate": 1e-06, "loss": 0.0132, "num_tokens": 202198155.0, "reward": 1.205078125, "reward_std": 0.273173987865448, "rewards/accuracy_reward/mean": 0.705078125, "rewards/accuracy_reward/std": 0.4564536213874817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3155.0, "completions/max_terminated_length": 3155.0, "completions/mean_length": 831.5546875, "completions/mean_terminated_length": 831.5546875, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.7895443996579944, "grad_norm": 0.35387225051476173, "learning_rate": 1e-06, "loss": 0.0153, "num_tokens": 202693159.0, "reward": 1.208984375, "reward_std": 0.26959553360939026, "rewards/accuracy_reward/mean": 0.708984375, "rewards/accuracy_reward/std": 0.45467492938041687, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 7718.0, "completions/max_terminated_length": 7718.0, "completions/mean_length": 822.166015625, "completions/mean_terminated_length": 823.7749633789062, "completions/min_length": 0.0, "completions/min_terminated_length": 367.0, "epoch": 0.79149871747893, "grad_norm": 0.3237887263193711, "learning_rate": 1e-06, "loss": -0.0075, "num_tokens": 203175356.0, "reward": 1.2001953125, "reward_std": 0.24427789449691772, "rewards/accuracy_reward/mean": 0.701171875, "rewards/accuracy_reward/std": 0.45819199085235596, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5268.0, "completions/max_terminated_length": 5268.0, "completions/mean_length": 857.345703125, "completions/mean_terminated_length": 857.345703125, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.7934530352998657, "grad_norm": 0.29168011380998693, "learning_rate": 1e-06, "loss": 0.0075, "num_tokens": 203676685.0, "reward": 1.28125, "reward_std": 0.23003166913986206, "rewards/accuracy_reward/mean": 0.78125, "rewards/accuracy_reward/std": 0.41380295157432556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3649.0, "completions/max_terminated_length": 3649.0, "completions/mean_length": 811.177734375, "completions/mean_terminated_length": 811.177734375, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.7954073531208012, "grad_norm": 0.376105611547192, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 204168392.0, "reward": 1.134765625, "reward_std": 0.26618099212646484, "rewards/accuracy_reward/mean": 0.634765625, "rewards/accuracy_reward/std": 0.4819667339324951, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5714.0, "completions/max_terminated_length": 5714.0, "completions/mean_length": 869.43359375, "completions/mean_terminated_length": 869.43359375, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.7973616709417369, "grad_norm": 0.28847549854200055, "learning_rate": 1e-06, "loss": 0.0077, "num_tokens": 204673574.0, "reward": 1.2021484375, "reward_std": 0.2509317994117737, "rewards/accuracy_reward/mean": 0.703125, "rewards/accuracy_reward/std": 0.45732781291007996, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.998046875, "rewards/soft_format_reward/std": 0.04419417306780815, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5213.0, "completions/max_terminated_length": 5213.0, "completions/mean_length": 868.015625, "completions/mean_terminated_length": 868.015625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.7993159887626725, "grad_norm": 0.319636585629475, "learning_rate": 1e-06, "loss": -0.0028, "num_tokens": 205178894.0, "reward": 1.171875, "reward_std": 0.29458165168762207, "rewards/accuracy_reward/mean": 0.671875, "rewards/accuracy_reward/std": 0.4699897766113281, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2088.0, "completions/max_terminated_length": 2088.0, "completions/mean_length": 811.818359375, "completions/mean_terminated_length": 811.818359375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.8012703065836082, "grad_norm": 0.35531193872391, "learning_rate": 1e-06, "loss": 0.0089, "num_tokens": 205650817.0, "reward": 1.212890625, "reward_std": 0.2410779893398285, "rewards/accuracy_reward/mean": 0.712890625, "rewards/accuracy_reward/std": 0.45285552740097046, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5580.0, "completions/max_terminated_length": 5580.0, "completions/mean_length": 783.1484375, "completions/mean_terminated_length": 783.1484375, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.8032246244045438, "grad_norm": 0.3643183188974139, "learning_rate": 1e-06, "loss": 0.0153, "num_tokens": 206109965.0, "reward": 1.328125, "reward_std": 0.18987837433815002, "rewards/accuracy_reward/mean": 0.828125, "rewards/accuracy_reward/std": 0.3776407241821289, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4124.0, "completions/max_terminated_length": 4124.0, "completions/mean_length": 795.90234375, "completions/mean_terminated_length": 795.90234375, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.8051789422254794, "grad_norm": 0.3573674837969747, "learning_rate": 1e-06, "loss": 0.0118, "num_tokens": 206575931.0, "reward": 1.220703125, "reward_std": 0.2888934016227722, "rewards/accuracy_reward/mean": 0.720703125, "rewards/accuracy_reward/std": 0.44909247756004333, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2433.0, "completions/max_terminated_length": 2433.0, "completions/mean_length": 766.810546875, "completions/mean_terminated_length": 766.810546875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.8071332600464151, "grad_norm": 0.3521201101619864, "learning_rate": 1e-06, "loss": -0.0048, "num_tokens": 207032570.0, "reward": 1.234375, "reward_std": 0.2204812467098236, "rewards/accuracy_reward/mean": 0.734375, "rewards/accuracy_reward/std": 0.44209739565849304, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7929.0, "completions/max_terminated_length": 7929.0, "completions/mean_length": 909.228515625, "completions/mean_terminated_length": 909.228515625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.8090875778673506, "grad_norm": 0.35595265396748055, "learning_rate": 1e-06, "loss": 0.0166, "num_tokens": 207576111.0, "reward": 1.12109375, "reward_std": 0.2677309215068817, "rewards/accuracy_reward/mean": 0.62109375, "rewards/accuracy_reward/std": 0.4855891764163971, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5867.0, "completions/max_terminated_length": 5867.0, "completions/mean_length": 842.884765625, "completions/mean_terminated_length": 842.884765625, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.8110418956882863, "grad_norm": 0.2703294250943154, "learning_rate": 1e-06, "loss": 0.0035, "num_tokens": 208069988.0, "reward": 1.21484375, "reward_std": 0.22637133300304413, "rewards/accuracy_reward/mean": 0.71484375, "rewards/accuracy_reward/std": 0.45193037390708923, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 5990.0, "completions/max_terminated_length": 5990.0, "completions/mean_length": 843.8984375, "completions/mean_terminated_length": 850.5432739257812, "completions/min_length": 0.0, "completions/min_terminated_length": 332.0, "epoch": 0.8129962135092219, "grad_norm": 0.3370587897049859, "learning_rate": 1e-06, "loss": -0.0377, "num_tokens": 208567328.0, "reward": 1.060546875, "reward_std": 0.26288777589797974, "rewards/accuracy_reward/mean": 0.564453125, "rewards/accuracy_reward/std": 0.49631330370903015, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.9921875, "rewards/soft_format_reward/std": 0.08812850713729858, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 6346.0, "completions/max_terminated_length": 6346.0, "completions/mean_length": 783.865234375, "completions/mean_terminated_length": 788.4852905273438, "completions/min_length": 0.0, "completions/min_terminated_length": 317.0, "epoch": 0.8149505313301576, "grad_norm": 0.39665042850965304, "learning_rate": 1e-06, "loss": 0.0034, "num_tokens": 209044107.0, "reward": 1.201171875, "reward_std": 0.29087740182876587, "rewards/accuracy_reward/mean": 0.705078125, "rewards/accuracy_reward/std": 0.4564536213874817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.9921875, "rewards/soft_format_reward/std": 0.08812850713729858, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2263.0, "completions/max_terminated_length": 2263.0, "completions/mean_length": 724.42578125, "completions/mean_terminated_length": 727.2667236328125, "completions/min_length": 0.0, "completions/min_terminated_length": 292.0, "epoch": 0.8169048491510932, "grad_norm": 0.42912039696135573, "learning_rate": 1e-06, "loss": -0.0041, "num_tokens": 209476085.0, "reward": 1.2353515625, "reward_std": 0.31185171008110046, "rewards/accuracy_reward/mean": 0.73828125, "rewards/accuracy_reward/std": 0.44000017642974854, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.994140625, "rewards/soft_format_reward/std": 0.07639661431312561, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2207.0, "completions/max_terminated_length": 2207.0, "completions/mean_length": 783.4921875, "completions/mean_terminated_length": 785.0254516601562, "completions/min_length": 0.0, "completions/min_terminated_length": 277.0, "epoch": 0.8188591669720289, "grad_norm": 0.3416888766786459, "learning_rate": 1e-06, "loss": 0.0028, "num_tokens": 209933121.0, "reward": 1.1796875, "reward_std": 0.24416999518871307, "rewards/accuracy_reward/mean": 0.681640625, "rewards/accuracy_reward/std": 0.46629536151885986, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 7035.0, "completions/max_terminated_length": 7035.0, "completions/mean_length": 893.9609375, "completions/mean_terminated_length": 904.561279296875, "completions/min_length": 0.0, "completions/min_terminated_length": 349.0, "epoch": 0.8208134847929645, "grad_norm": 0.33763547934277416, "learning_rate": 1e-06, "loss": -0.0056, "num_tokens": 210451181.0, "reward": 1.1533203125, "reward_std": 0.28630441427230835, "rewards/accuracy_reward/mean": 0.66015625, "rewards/accuracy_reward/std": 0.4741191864013672, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.986328125, "rewards/soft_format_reward/std": 0.1162383034825325, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 7300.0, "completions/max_terminated_length": 7300.0, "completions/mean_length": 834.74609375, "completions/mean_terminated_length": 841.3189086914062, "completions/min_length": 0.0, "completions/min_terminated_length": 302.0, "epoch": 0.8227678026139, "grad_norm": 0.33386172323003377, "learning_rate": 1e-06, "loss": 0.0102, "num_tokens": 210963211.0, "reward": 1.1533203125, "reward_std": 0.2574579417705536, "rewards/accuracy_reward/mean": 0.66015625, "rewards/accuracy_reward/std": 0.4741191864013672, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.986328125, "rewards/soft_format_reward/std": 0.1162383034825325, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 7065.0, "completions/max_terminated_length": 7065.0, "completions/mean_length": 814.94921875, "completions/mean_terminated_length": 819.7525024414062, "completions/min_length": 0.0, "completions/min_terminated_length": 306.0, "epoch": 0.8247221204348357, "grad_norm": 0.3652594693962747, "learning_rate": 1e-06, "loss": 0.0058, "num_tokens": 211454097.0, "reward": 1.1005859375, "reward_std": 0.2645086646080017, "rewards/accuracy_reward/mean": 0.60546875, "rewards/accuracy_reward/std": 0.4892277717590332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.990234375, "rewards/soft_format_reward/std": 0.09843364357948303, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 7479.0, "completions/max_terminated_length": 7479.0, "completions/mean_length": 795.013671875, "completions/mean_terminated_length": 796.5694580078125, "completions/min_length": 0.0, "completions/min_terminated_length": 342.0, "epoch": 0.8266764382557713, "grad_norm": 0.3293846810079901, "learning_rate": 1e-06, "loss": 0.0145, "num_tokens": 211930280.0, "reward": 1.1337890625, "reward_std": 0.26773542165756226, "rewards/accuracy_reward/mean": 0.63671875, "rewards/accuracy_reward/std": 0.4814152419567108, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.994140625, "rewards/soft_format_reward/std": 0.07639661431312561, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 4412.0, "completions/max_terminated_length": 4412.0, "completions/mean_length": 874.89453125, "completions/mean_terminated_length": 883.522705078125, "completions/min_length": 0.0, "completions/min_terminated_length": 347.0, "epoch": 0.828630756076707, "grad_norm": 0.3535089662836341, "learning_rate": 1e-06, "loss": -0.0016, "num_tokens": 212440290.0, "reward": 1.13671875, "reward_std": 0.29048609733581543, "rewards/accuracy_reward/mean": 0.64453125, "rewards/accuracy_reward/std": 0.47912323474884033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.984375, "rewards/soft_format_reward/std": 0.12414088100194931, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 6280.0, "completions/max_terminated_length": 6280.0, "completions/mean_length": 893.9921875, "completions/mean_terminated_length": 897.4981079101562, "completions/min_length": 0.0, "completions/min_terminated_length": 390.0, "epoch": 0.8305850738976426, "grad_norm": 0.3185673827005288, "learning_rate": 1e-06, "loss": 0.0033, "num_tokens": 212974542.0, "reward": 1.125, "reward_std": 0.28310471773147583, "rewards/accuracy_reward/mean": 0.626953125, "rewards/accuracy_reward/std": 0.48408737778663635, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4507.0, "completions/max_terminated_length": 4507.0, "completions/mean_length": 900.79296875, "completions/mean_terminated_length": 904.3255615234375, "completions/min_length": 0.0, "completions/min_terminated_length": 312.0, "epoch": 0.8325393917185783, "grad_norm": 0.3267191408879667, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 213512532.0, "reward": 1.0380859375, "reward_std": 0.26320767402648926, "rewards/accuracy_reward/mean": 0.541015625, "rewards/accuracy_reward/std": 0.49880221486091614, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.994140625, "rewards/soft_format_reward/std": 0.07639661431312561, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 3350.0, "completions/max_terminated_length": 3350.0, "completions/mean_length": 849.65625, "completions/mean_terminated_length": 851.3189697265625, "completions/min_length": 0.0, "completions/min_terminated_length": 332.0, "epoch": 0.8344937095395139, "grad_norm": 0.3430058021350143, "learning_rate": 1e-06, "loss": 0.0141, "num_tokens": 214019156.0, "reward": 1.13671875, "reward_std": 0.2617988884449005, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.48028653860092163, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.9921875, "rewards/soft_format_reward/std": 0.08812850713729858, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 8024.0, "completions/max_terminated_length": 8024.0, "completions/mean_length": 908.203125, "completions/mean_terminated_length": 911.7647705078125, "completions/min_length": 0.0, "completions/min_terminated_length": 310.0, "epoch": 0.8364480273604495, "grad_norm": 0.3490585175056411, "learning_rate": 1e-06, "loss": 0.008, "num_tokens": 214545612.0, "reward": 1.1923828125, "reward_std": 0.30423495173454285, "rewards/accuracy_reward/mean": 0.6953125, "rewards/accuracy_reward/std": 0.4607250988483429, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.994140625, "rewards/soft_format_reward/std": 0.07639661431312561, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2856.0, "completions/max_terminated_length": 2856.0, "completions/mean_length": 773.9296875, "completions/mean_terminated_length": 775.4442138671875, "completions/min_length": 0.0, "completions/min_terminated_length": 302.0, "epoch": 0.8384023451813851, "grad_norm": 0.3545128571512148, "learning_rate": 1e-06, "loss": -0.0035, "num_tokens": 215040664.0, "reward": 1.0986328125, "reward_std": 0.29531624913215637, "rewards/accuracy_reward/mean": 0.6015625, "rewards/accuracy_reward/std": 0.4900552034378052, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.994140625, "rewards/soft_format_reward/std": 0.07639661431312561, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 5426.0, "completions/max_terminated_length": 5426.0, "completions/mean_length": 816.771484375, "completions/mean_terminated_length": 819.9745483398438, "completions/min_length": 0.0, "completions/min_terminated_length": 319.0, "epoch": 0.8403566630023207, "grad_norm": 0.2718305462169434, "learning_rate": 1e-06, "loss": 0.0078, "num_tokens": 215523379.0, "reward": 1.166015625, "reward_std": 0.22507482767105103, "rewards/accuracy_reward/mean": 0.669921875, "rewards/accuracy_reward/std": 0.47070086002349854, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.9921875, "rewards/soft_format_reward/std": 0.08812850713729858, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1784.0, "completions/max_terminated_length": 1784.0, "completions/mean_length": 841.55078125, "completions/mean_terminated_length": 848.1771850585938, "completions/min_length": 0.0, "completions/min_terminated_length": 233.0, "epoch": 0.8423109808232564, "grad_norm": 0.2822841226319474, "learning_rate": 1e-06, "loss": -0.0183, "num_tokens": 216030333.0, "reward": 1.197265625, "reward_std": 0.25299516320228577, "rewards/accuracy_reward/mean": 0.701171875, "rewards/accuracy_reward/std": 0.45819199085235596, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.9921875, "rewards/soft_format_reward/std": 0.08812850713729858, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3192.0, "completions/max_terminated_length": 3192.0, "completions/mean_length": 806.685546875, "completions/mean_terminated_length": 806.685546875, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.844265298644192, "grad_norm": 0.29442535204726733, "learning_rate": 1e-06, "loss": -0.0061, "num_tokens": 216511276.0, "reward": 1.11328125, "reward_std": 0.2568894326686859, "rewards/accuracy_reward/mean": 0.61328125, "rewards/accuracy_reward/std": 0.48747459053993225, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 1.0, "rewards/soft_format_reward/std": 0.0, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 7614.0, "completions/max_terminated_length": 7614.0, "completions/mean_length": 915.326171875, "completions/mean_terminated_length": 917.117431640625, "completions/min_length": 0.0, "completions/min_terminated_length": 273.0, "epoch": 0.8462196164651277, "grad_norm": 0.3246937572208824, "learning_rate": 1e-06, "loss": -0.0129, "num_tokens": 217038419.0, "reward": 1.072265625, "reward_std": 0.29967623949050903, "rewards/accuracy_reward/mean": 0.576171875, "rewards/accuracy_reward/std": 0.4946470856666565, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.9921875, "rewards/soft_format_reward/std": 0.08812850713729858, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4563.0, "completions/max_terminated_length": 4563.0, "completions/mean_length": 879.8125, "completions/mean_terminated_length": 879.8125, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.8481739342860632, "grad_norm": 0.297249871635069, "learning_rate": 1e-06, "loss": 0.0092, "num_tokens": 217553843.0, "reward": 1.251953125, "reward_std": 0.2540125250816345, "rewards/accuracy_reward/mean": 0.75390625, "rewards/accuracy_reward/std": 0.4311550557613373, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 5545.0, "completions/max_terminated_length": 5545.0, "completions/mean_length": 939.572265625, "completions/mean_terminated_length": 950.7135009765625, "completions/min_length": 0.0, "completions/min_terminated_length": 352.0, "epoch": 0.8501282521069989, "grad_norm": 0.3374361113595728, "learning_rate": 1e-06, "loss": -0.0255, "num_tokens": 218098152.0, "reward": 1.064453125, "reward_std": 0.34374499320983887, "rewards/accuracy_reward/mean": 0.57421875, "rewards/accuracy_reward/std": 0.4949444830417633, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.98046875, "rewards/soft_format_reward/std": 0.1385180652141571, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 6207.0, "completions/max_terminated_length": 6207.0, "completions/mean_length": 837.673828125, "completions/mean_terminated_length": 839.3131103515625, "completions/min_length": 0.0, "completions/min_terminated_length": 293.0, "epoch": 0.8520825699279345, "grad_norm": 0.32954076563755397, "learning_rate": 1e-06, "loss": 0.0125, "num_tokens": 218603585.0, "reward": 1.09375, "reward_std": 0.24614398181438446, "rewards/accuracy_reward/mean": 0.6015625, "rewards/accuracy_reward/std": 0.4900552034378052, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.984375, "rewards/soft_format_reward/std": 0.12414088100194931, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1599.0, "completions/max_terminated_length": 1599.0, "completions/mean_length": 749.869140625, "completions/mean_terminated_length": 751.3366088867188, "completions/min_length": 0.0, "completions/min_terminated_length": 269.0, "epoch": 0.8540368877488702, "grad_norm": 0.299203277143821, "learning_rate": 1e-06, "loss": -0.0143, "num_tokens": 219059486.0, "reward": 1.126953125, "reward_std": 0.23192954063415527, "rewards/accuracy_reward/mean": 0.62890625, "rewards/accuracy_reward/std": 0.4835699498653412, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.99609375, "rewards/soft_format_reward/std": 0.06243881583213806, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 5557.0, "completions/max_terminated_length": 5557.0, "completions/mean_length": 937.2109375, "completions/mean_terminated_length": 940.8863525390625, "completions/min_length": 0.0, "completions/min_terminated_length": 228.0, "epoch": 0.8559912055698058, "grad_norm": 0.302322572675226, "learning_rate": 1e-06, "loss": 0.0055, "num_tokens": 219601418.0, "reward": 1.2060546875, "reward_std": 0.2617872357368469, "rewards/accuracy_reward/mean": 0.7109375, "rewards/accuracy_reward/std": 0.45377036929130554, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.990234375, "rewards/soft_format_reward/std": 0.09843364357948303, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 6134.0, "completions/max_terminated_length": 6134.0, "completions/mean_length": 976.826171875, "completions/mean_terminated_length": 998.2734375, "completions/min_length": 0.0, "completions/min_terminated_length": 349.0, "epoch": 0.8579455233907414, "grad_norm": 0.3719115885092292, "learning_rate": 1e-06, "loss": -0.0254, "num_tokens": 220170401.0, "reward": 0.9501953125, "reward_std": 0.37683263421058655, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.4995105266571045, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.962890625, "rewards/soft_format_reward/std": 0.18921469151973724, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 7540.0, "completions/max_terminated_length": 7540.0, "completions/mean_length": 1117.2578125, "completions/mean_terminated_length": 1121.6392822265625, "completions/min_length": 0.0, "completions/min_terminated_length": 560.0, "epoch": 0.8598998412116771, "grad_norm": 0.38724736233093465, "learning_rate": 1e-06, "loss": -0.0202, "num_tokens": 220813669.0, "reward": 0.8701171875, "reward_std": 0.3525937497615814, "rewards/accuracy_reward/mean": 0.384765625, "rewards/accuracy_reward/std": 0.4870156943798065, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.970703125, "rewards/soft_format_reward/std": 0.16880230605602264, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2727.0, "completions/max_terminated_length": 2727.0, "completions/mean_length": 1055.904296875, "completions/mean_terminated_length": 1060.045166015625, "completions/min_length": 0.0, "completions/min_terminated_length": 523.0, "epoch": 0.8618541590326126, "grad_norm": 0.5770145242402152, "learning_rate": 1e-06, "loss": -0.0132, "num_tokens": 221425540.0, "reward": 0.9208984375, "reward_std": 0.33073461055755615, "rewards/accuracy_reward/mean": 0.443359375, "rewards/accuracy_reward/std": 0.49726733565330505, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.955078125, "rewards/soft_format_reward/std": 0.20733514428138733, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3204.0, "completions/max_terminated_length": 3204.0, "completions/mean_length": 1063.28125, "completions/mean_terminated_length": 1071.653564453125, "completions/min_length": 0.0, "completions/min_terminated_length": 286.0, "epoch": 0.8638084768535483, "grad_norm": 0.632409719274583, "learning_rate": 1e-06, "loss": -0.0203, "num_tokens": 222045060.0, "reward": 0.8525390625, "reward_std": 0.35422688722610474, "rewards/accuracy_reward/mean": 0.392578125, "rewards/accuracy_reward/std": 0.4888018071651459, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.919921875, "rewards/soft_format_reward/std": 0.271679550409317, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 6647.0, "completions/max_terminated_length": 6647.0, "completions/mean_length": 953.896484375, "completions/mean_terminated_length": 967.1188354492188, "completions/min_length": 0.0, "completions/min_terminated_length": 402.0, "epoch": 0.8657627946744839, "grad_norm": 0.5184286136801499, "learning_rate": 1e-06, "loss": -0.0515, "num_tokens": 222602047.0, "reward": 0.8583984375, "reward_std": 0.3725011944770813, "rewards/accuracy_reward/mean": 0.416015625, "rewards/accuracy_reward/std": 0.493378221988678, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.884765625, "rewards/soft_format_reward/std": 0.3196168541908264, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 5605.0, "completions/max_terminated_length": 5605.0, "completions/mean_length": 910.240234375, "completions/mean_terminated_length": 937.7122192382812, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.8677171124954196, "grad_norm": 1.1610365297434613, "learning_rate": 1e-06, "loss": -0.076, "num_tokens": 223138042.0, "reward": 0.755859375, "reward_std": 0.4233691096305847, "rewards/accuracy_reward/mean": 0.328125, "rewards/accuracy_reward/std": 0.4699897766113281, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.85546875, "rewards/soft_format_reward/std": 0.35197147727012634, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 7396.0, "completions/max_terminated_length": 7396.0, "completions/mean_length": 834.078125, "completions/mean_terminated_length": 878.6995849609375, "completions/min_length": 0.0, "completions/min_terminated_length": 199.0, "epoch": 0.8696714303163552, "grad_norm": 1.228348619301139, "learning_rate": 1e-06, "loss": -0.1206, "num_tokens": 223628466.0, "reward": 0.6611328125, "reward_std": 0.4097694754600525, "rewards/accuracy_reward/mean": 0.271484375, "rewards/accuracy_reward/std": 0.44516023993492126, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.779296875, "rewards/soft_format_reward/std": 0.4151262938976288, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 5346.0, "completions/max_terminated_length": 5346.0, "completions/mean_length": 820.140625, "completions/mean_terminated_length": 864.0164184570312, "completions/min_length": 0.0, "completions/min_terminated_length": 295.0, "epoch": 0.8716257481372909, "grad_norm": 0.9524625747029093, "learning_rate": 1e-06, "loss": -0.1072, "num_tokens": 224117290.0, "reward": 0.7109375, "reward_std": 0.49370861053466797, "rewards/accuracy_reward/mean": 0.3203125, "rewards/accuracy_reward/std": 0.4670529365539551, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.78125, "rewards/soft_format_reward/std": 0.41380295157432556, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.072265625, "completions/max_length": 6239.0, "completions/max_terminated_length": 6239.0, "completions/mean_length": 811.419921875, "completions/mean_terminated_length": 874.625244140625, "completions/min_length": 0.0, "completions/min_terminated_length": 222.0, "epoch": 0.8735800659582265, "grad_norm": 1.3311003300904964, "learning_rate": 1e-06, "loss": -0.105, "num_tokens": 224603297.0, "reward": 0.5322265625, "reward_std": 0.42388588190078735, "rewards/accuracy_reward/mean": 0.185546875, "rewards/accuracy_reward/std": 0.38912075757980347, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.693359375, "rewards/soft_format_reward/std": 0.4615498185157776, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 7076.0, "completions/max_terminated_length": 7076.0, "completions/mean_length": 797.146484375, "completions/mean_terminated_length": 839.7921752929688, "completions/min_length": 0.0, "completions/min_terminated_length": 196.0, "epoch": 0.875534383779162, "grad_norm": 1.7947968239974508, "learning_rate": 1e-06, "loss": -0.1187, "num_tokens": 225086988.0, "reward": 0.58203125, "reward_std": 0.4251861572265625, "rewards/accuracy_reward/mean": 0.22265625, "rewards/accuracy_reward/std": 0.41643625497817993, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.71875, "rewards/soft_format_reward/std": 0.45004892349243164, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.083984375, "completions/max_length": 7496.0, "completions/max_terminated_length": 7496.0, "completions/mean_length": 729.158203125, "completions/mean_terminated_length": 796.0106811523438, "completions/min_length": 0.0, "completions/min_terminated_length": 190.0, "epoch": 0.8774887016000977, "grad_norm": 1.7079583135439507, "learning_rate": 1e-06, "loss": -0.1622, "num_tokens": 225540973.0, "reward": 0.66796875, "reward_std": 0.4337148666381836, "rewards/accuracy_reward/mean": 0.294921875, "rewards/accuracy_reward/std": 0.4564536213874817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.74609375, "rewards/soft_format_reward/std": 0.43567025661468506, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.087890625, "completions/max_length": 6698.0, "completions/max_terminated_length": 6698.0, "completions/mean_length": 728.2890625, "completions/mean_terminated_length": 798.466796875, "completions/min_length": 0.0, "completions/min_terminated_length": 232.0, "epoch": 0.8794430194210333, "grad_norm": 1.5059900902257461, "learning_rate": 1e-06, "loss": -0.1841, "num_tokens": 225989025.0, "reward": 0.70703125, "reward_std": 0.3947368264198303, "rewards/accuracy_reward/mean": 0.3046875, "rewards/accuracy_reward/std": 0.4607250988483429, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.8046875, "rewards/soft_format_reward/std": 0.3968288004398346, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 6558.0, "completions/max_terminated_length": 6558.0, "completions/mean_length": 769.501953125, "completions/mean_terminated_length": 810.668701171875, "completions/min_length": 0.0, "completions/min_terminated_length": 285.0, "epoch": 0.881397337241969, "grad_norm": 1.7148878153209133, "learning_rate": 1e-06, "loss": -0.0692, "num_tokens": 226456242.0, "reward": 0.9013671875, "reward_std": 0.4203614592552185, "rewards/accuracy_reward/mean": 0.45703125, "rewards/accuracy_reward/std": 0.49863746762275696, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.888671875, "rewards/soft_format_reward/std": 0.31484565138816833, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.041015625, "completions/max_length": 4988.0, "completions/max_terminated_length": 4988.0, "completions/mean_length": 842.943359375, "completions/mean_terminated_length": 878.9959716796875, "completions/min_length": 0.0, "completions/min_terminated_length": 205.0, "epoch": 0.8833516550629046, "grad_norm": 1.7103988708178688, "learning_rate": 1e-06, "loss": -0.0979, "num_tokens": 226964917.0, "reward": 0.83984375, "reward_std": 0.4147023558616638, "rewards/accuracy_reward/mean": 0.388671875, "rewards/accuracy_reward/std": 0.4879252314567566, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.90234375, "rewards/soft_format_reward/std": 0.29713961482048035, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.060546875, "completions/max_length": 6793.0, "completions/max_terminated_length": 6793.0, "completions/mean_length": 755.958984375, "completions/mean_terminated_length": 804.6798095703125, "completions/min_length": 0.0, "completions/min_terminated_length": 260.0, "epoch": 0.8853059728838403, "grad_norm": 1.2153991907420474, "learning_rate": 1e-06, "loss": -0.1361, "num_tokens": 227430064.0, "reward": 0.728515625, "reward_std": 0.41981327533721924, "rewards/accuracy_reward/mean": 0.29296875, "rewards/accuracy_reward/std": 0.455569326877594, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.87109375, "rewards/soft_format_reward/std": 0.33542385697364807, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 6801.0, "completions/max_terminated_length": 6801.0, "completions/mean_length": 829.00390625, "completions/mean_terminated_length": 866.2244262695312, "completions/min_length": 0.0, "completions/min_terminated_length": 328.0, "epoch": 0.8872602907047759, "grad_norm": 1.0143662061273302, "learning_rate": 1e-06, "loss": -0.0666, "num_tokens": 227941026.0, "reward": 0.7861328125, "reward_std": 0.35778701305389404, "rewards/accuracy_reward/mean": 0.330078125, "rewards/accuracy_reward/std": 0.47070086002349854, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.912109375, "rewards/soft_format_reward/std": 0.2834126651287079, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 7529.0, "completions/max_terminated_length": 7529.0, "completions/mean_length": 816.46484375, "completions/mean_terminated_length": 867.2822265625, "completions/min_length": 0.0, "completions/min_terminated_length": 314.0, "epoch": 0.8892146085257115, "grad_norm": 1.74482817636668, "learning_rate": 1e-06, "loss": -0.1584, "num_tokens": 228430320.0, "reward": 0.7958984375, "reward_std": 0.42428719997406006, "rewards/accuracy_reward/mean": 0.34375, "rewards/accuracy_reward/std": 0.4754233956336975, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.904296875, "rewards/soft_format_reward/std": 0.2944713830947876, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 7920.0, "completions/max_terminated_length": 7920.0, "completions/mean_length": 740.548828125, "completions/mean_terminated_length": 776.96923828125, "completions/min_length": 0.0, "completions/min_terminated_length": 312.0, "epoch": 0.8911689263466471, "grad_norm": 1.4782964756227825, "learning_rate": 1e-06, "loss": -0.1101, "num_tokens": 228876761.0, "reward": 0.712890625, "reward_std": 0.30588382482528687, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.43343618512153625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.92578125, "rewards/soft_format_reward/std": 0.2623828947544098, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025390625, "completions/max_length": 6767.0, "completions/max_terminated_length": 6767.0, "completions/mean_length": 855.7265625, "completions/mean_terminated_length": 878.0200805664062, "completions/min_length": 0.0, "completions/min_terminated_length": 229.0, "epoch": 0.8931232441675827, "grad_norm": 0.7929241327750952, "learning_rate": 1e-06, "loss": -0.0717, "num_tokens": 229395565.0, "reward": 0.8359375, "reward_std": 0.3727160096168518, "rewards/accuracy_reward/mean": 0.3515625, "rewards/accuracy_reward/std": 0.4779251217842102, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.96875, "rewards/soft_format_reward/std": 0.17416280508041382, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 8164.0, "completions/max_terminated_length": 8164.0, "completions/mean_length": 845.150390625, "completions/mean_terminated_length": 860.2723388671875, "completions/min_length": 0.0, "completions/min_terminated_length": 340.0, "epoch": 0.8950775619885184, "grad_norm": 0.7768193705441528, "learning_rate": 1e-06, "loss": -0.0476, "num_tokens": 229903898.0, "reward": 0.9296875, "reward_std": 0.40889424085617065, "rewards/accuracy_reward/mean": 0.443359375, "rewards/accuracy_reward/std": 0.49726733565330505, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.97265625, "rewards/soft_format_reward/std": 0.16324250400066376, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 5151.0, "completions/max_terminated_length": 5151.0, "completions/mean_length": 929.96484375, "completions/mean_terminated_length": 939.1361083984375, "completions/min_length": 0.0, "completions/min_terminated_length": 213.0, "epoch": 0.897031879809454, "grad_norm": 0.7104551416698567, "learning_rate": 1e-06, "loss": -0.0456, "num_tokens": 230452392.0, "reward": 0.732421875, "reward_std": 0.3058362901210785, "rewards/accuracy_reward/mean": 0.2421875, "rewards/accuracy_reward/std": 0.42882615327835083, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.98046875, "rewards/soft_format_reward/std": 0.1385180652141571, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 3596.0, "completions/max_terminated_length": 3596.0, "completions/mean_length": 830.177734375, "completions/mean_terminated_length": 848.4052124023438, "completions/min_length": 0.0, "completions/min_terminated_length": 315.0, "epoch": 0.8989861976303897, "grad_norm": 0.7393593831490892, "learning_rate": 1e-06, "loss": -0.0349, "num_tokens": 230953843.0, "reward": 0.880859375, "reward_std": 0.3243863880634308, "rewards/accuracy_reward/mean": 0.396484375, "rewards/accuracy_reward/std": 0.4896455705165863, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.96875, "rewards/soft_format_reward/std": 0.17416280508041382, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 6314.0, "completions/max_terminated_length": 6314.0, "completions/mean_length": 907.875, "completions/mean_terminated_length": 916.8284301757812, "completions/min_length": 0.0, "completions/min_terminated_length": 220.0, "epoch": 0.9009405154513253, "grad_norm": 0.5389355879892459, "learning_rate": 1e-06, "loss": -0.0437, "num_tokens": 231491571.0, "reward": 0.9267578125, "reward_std": 0.33239421248435974, "rewards/accuracy_reward/mean": 0.435546875, "rewards/accuracy_reward/std": 0.49631330370903015, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.982421875, "rewards/soft_format_reward/std": 0.13154059648513794, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 6925.0, "completions/max_terminated_length": 6925.0, "completions/mean_length": 864.8828125, "completions/mean_terminated_length": 883.8722534179688, "completions/min_length": 0.0, "completions/min_terminated_length": 249.0, "epoch": 0.9028948332722609, "grad_norm": 0.6214866889272922, "learning_rate": 1e-06, "loss": -0.0585, "num_tokens": 232005927.0, "reward": 0.8662109375, "reward_std": 0.3578924536705017, "rewards/accuracy_reward/mean": 0.380859375, "rewards/accuracy_reward/std": 0.48607301712036133, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.970703125, "rewards/soft_format_reward/std": 0.16880230605602264, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 5988.0, "completions/max_terminated_length": 5988.0, "completions/mean_length": 878.0234375, "completions/mean_terminated_length": 891.9603881835938, "completions/min_length": 0.0, "completions/min_terminated_length": 226.0, "epoch": 0.9048491510931965, "grad_norm": 0.5965500887413218, "learning_rate": 1e-06, "loss": -0.053, "num_tokens": 232529299.0, "reward": 0.9443359375, "reward_std": 0.38830047845840454, "rewards/accuracy_reward/mean": 0.455078125, "rewards/accuracy_reward/std": 0.4984649419784546, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.978515625, "rewards/soft_format_reward/std": 0.14513419568538666, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 2278.0, "completions/max_terminated_length": 2278.0, "completions/mean_length": 734.92578125, "completions/mean_terminated_length": 742.173583984375, "completions/min_length": 0.0, "completions/min_terminated_length": 213.0, "epoch": 0.9068034689141322, "grad_norm": 0.6019906833328859, "learning_rate": 1e-06, "loss": -0.0066, "num_tokens": 232966173.0, "reward": 1.1201171875, "reward_std": 0.2919842600822449, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.4845963716506958, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.990234375, "rewards/soft_format_reward/std": 0.09843364357948303, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 2833.0, "completions/max_terminated_length": 2833.0, "completions/mean_length": 776.91796875, "completions/mean_terminated_length": 787.6871337890625, "completions/min_length": 0.0, "completions/min_terminated_length": 288.0, "epoch": 0.9087577867350678, "grad_norm": 0.5215486200911099, "learning_rate": 1e-06, "loss": -0.0437, "num_tokens": 233428259.0, "reward": 1.07421875, "reward_std": 0.2650104761123657, "rewards/accuracy_reward/mean": 0.583984375, "rewards/accuracy_reward/std": 0.493378221988678, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.98046875, "rewards/soft_format_reward/std": 0.1385180652141571, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 7633.0, "completions/max_terminated_length": 7633.0, "completions/mean_length": 762.11328125, "completions/mean_terminated_length": 774.2103881835938, "completions/min_length": 0.0, "completions/min_terminated_length": 295.0, "epoch": 0.9107121045560034, "grad_norm": 0.5741432760093643, "learning_rate": 1e-06, "loss": -0.0257, "num_tokens": 233880493.0, "reward": 1.072265625, "reward_std": 0.37611234188079834, "rewards/accuracy_reward/mean": 0.580078125, "rewards/accuracy_reward/std": 0.4940285086631775, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.984375, "rewards/soft_format_reward/std": 0.12414088100194931, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 7668.0, "completions/max_terminated_length": 7668.0, "completions/mean_length": 717.677734375, "completions/mean_terminated_length": 743.8279418945312, "completions/min_length": 0.0, "completions/min_terminated_length": 286.0, "epoch": 0.9126664223769391, "grad_norm": 0.6780032338708568, "learning_rate": 1e-06, "loss": -0.0732, "num_tokens": 234307736.0, "reward": 0.943359375, "reward_std": 0.3413737416267395, "rewards/accuracy_reward/mean": 0.462890625, "rewards/accuracy_reward/std": 0.4991086423397064, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.9609375, "rewards/soft_format_reward/std": 0.1939331740140915, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037109375, "completions/max_length": 5676.0, "completions/max_terminated_length": 5676.0, "completions/mean_length": 669.8046875, "completions/mean_terminated_length": 695.61865234375, "completions/min_length": 0.0, "completions/min_terminated_length": 367.0, "epoch": 0.9146207401978746, "grad_norm": 0.9422764202961365, "learning_rate": 1e-06, "loss": -0.0675, "num_tokens": 234715908.0, "reward": 1.029296875, "reward_std": 0.3299490213394165, "rewards/accuracy_reward/mean": 0.548828125, "rewards/accuracy_reward/std": 0.498096764087677, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.9609375, "rewards/soft_format_reward/std": 0.1939331740140915, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 5438.0, "completions/max_terminated_length": 5438.0, "completions/mean_length": 777.9296875, "completions/mean_terminated_length": 791.848876953125, "completions/min_length": 0.0, "completions/min_terminated_length": 272.0, "epoch": 0.9165750580188103, "grad_norm": 0.5789160710563247, "learning_rate": 1e-06, "loss": -0.0271, "num_tokens": 235178544.0, "reward": 1.107421875, "reward_std": 0.31259697675704956, "rewards/accuracy_reward/mean": 0.6171875, "rewards/accuracy_reward/std": 0.486548513174057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.98046875, "rewards/soft_format_reward/std": 0.1385180652141571, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 7111.0, "completions/max_terminated_length": 7111.0, "completions/mean_length": 816.837890625, "completions/mean_terminated_length": 850.0426635742188, "completions/min_length": 0.0, "completions/min_terminated_length": 334.0, "epoch": 0.9185293758397459, "grad_norm": 0.5758210097295741, "learning_rate": 1e-06, "loss": -0.0602, "num_tokens": 235664669.0, "reward": 0.9736328125, "reward_std": 0.2642187774181366, "rewards/accuracy_reward/mean": 0.4921875, "rewards/accuracy_reward/std": 0.5004279017448425, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.962890625, "rewards/soft_format_reward/std": 0.18921469151973724, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 7104.0, "completions/max_terminated_length": 7104.0, "completions/mean_length": 674.55859375, "completions/mean_terminated_length": 689.3692626953125, "completions/min_length": 0.0, "completions/min_terminated_length": 295.0, "epoch": 0.9204836936606816, "grad_norm": 0.6497836324935389, "learning_rate": 1e-06, "loss": -0.0393, "num_tokens": 236069947.0, "reward": 1.0791015625, "reward_std": 0.3011692762374878, "rewards/accuracy_reward/mean": 0.591796875, "rewards/accuracy_reward/std": 0.49198177456855774, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.974609375, "rewards/soft_format_reward/std": 0.15746226906776428, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 5687.0, "completions/max_terminated_length": 5687.0, "completions/mean_length": 772.49609375, "completions/mean_terminated_length": 789.4570922851562, "completions/min_length": 0.0, "completions/min_terminated_length": 283.0, "epoch": 0.9224380114816172, "grad_norm": 0.5783808120324807, "learning_rate": 1e-06, "loss": -0.0454, "num_tokens": 236528537.0, "reward": 1.0234375, "reward_std": 0.30921226739883423, "rewards/accuracy_reward/mean": 0.53515625, "rewards/accuracy_reward/std": 0.49925029277801514, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.9765625, "rewards/soft_format_reward/std": 0.15143637359142303, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 3878.0, "completions/max_terminated_length": 3878.0, "completions/mean_length": 701.900390625, "completions/mean_terminated_length": 736.4200439453125, "completions/min_length": 0.0, "completions/min_terminated_length": 264.0, "epoch": 0.9243923293025528, "grad_norm": 0.5515443128324211, "learning_rate": 1e-06, "loss": -0.0806, "num_tokens": 236947494.0, "reward": 1.03125, "reward_std": 0.3871540427207947, "rewards/accuracy_reward/mean": 0.556640625, "rewards/accuracy_reward/std": 0.49726733565330505, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.94921875, "rewards/soft_format_reward/std": 0.21976542472839355, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 7876.0, "completions/max_terminated_length": 7876.0, "completions/mean_length": 780.8359375, "completions/mean_terminated_length": 847.0084838867188, "completions/min_length": 0.0, "completions/min_terminated_length": 311.0, "epoch": 0.9263466471234885, "grad_norm": 0.9965213328415409, "learning_rate": 1e-06, "loss": -0.1591, "num_tokens": 237419074.0, "reward": 0.994140625, "reward_std": 0.3784325122833252, "rewards/accuracy_reward/mean": 0.5390625, "rewards/accuracy_reward/std": 0.4989593029022217, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.91015625, "rewards/soft_format_reward/std": 0.2862374484539032, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037109375, "completions/max_length": 7124.0, "completions/max_terminated_length": 7124.0, "completions/mean_length": 888.017578125, "completions/mean_terminated_length": 922.2413330078125, "completions/min_length": 0.0, "completions/min_terminated_length": 371.0, "epoch": 0.928300964944424, "grad_norm": 0.9802810434685564, "learning_rate": 1e-06, "loss": -0.0565, "num_tokens": 237948235.0, "reward": 0.9951171875, "reward_std": 0.3843446969985962, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.5002445578575134, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.958984375, "rewards/soft_format_reward/std": 0.19852031767368317, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.091796875, "completions/max_length": 8153.0, "completions/max_terminated_length": 8153.0, "completions/mean_length": 762.35546875, "completions/mean_terminated_length": 839.4107666015625, "completions/min_length": 0.0, "completions/min_terminated_length": 372.0, "epoch": 0.9302552827653597, "grad_norm": 0.8744620153789759, "learning_rate": 1e-06, "loss": -0.1564, "num_tokens": 238407985.0, "reward": 0.982421875, "reward_std": 0.4088676869869232, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.4995105266571045, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.90234375, "rewards/soft_format_reward/std": 0.29713961482048035, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 7231.0, "completions/max_terminated_length": 7231.0, "completions/mean_length": 794.966796875, "completions/mean_terminated_length": 840.9566040039062, "completions/min_length": 0.0, "completions/min_terminated_length": 253.0, "epoch": 0.9322096005862953, "grad_norm": 0.6613375246696646, "learning_rate": 1e-06, "loss": -0.1236, "num_tokens": 238878544.0, "reward": 1.0673828125, "reward_std": 0.2954113483428955, "rewards/accuracy_reward/mean": 0.595703125, "rewards/accuracy_reward/std": 0.4912354052066803, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.943359375, "rewards/soft_format_reward/std": 0.23138070106506348, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 7122.0, "completions/max_terminated_length": 7122.0, "completions/mean_length": 810.68359375, "completions/mean_terminated_length": 875.675048828125, "completions/min_length": 0.0, "completions/min_terminated_length": 339.0, "epoch": 0.934163918407231, "grad_norm": 0.9317973658296996, "learning_rate": 1e-06, "loss": -0.1198, "num_tokens": 239358910.0, "reward": 0.95703125, "reward_std": 0.46614953875541687, "rewards/accuracy_reward/mean": 0.49609375, "rewards/accuracy_reward/std": 0.5004737377166748, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.921875, "rewards/soft_format_reward/std": 0.26863065361976624, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 8120.0, "completions/max_terminated_length": 8120.0, "completions/mean_length": 823.8203125, "completions/mean_terminated_length": 897.4382934570312, "completions/min_length": 0.0, "completions/min_terminated_length": 339.0, "epoch": 0.9361182362281666, "grad_norm": 0.9342506130221416, "learning_rate": 1e-06, "loss": -0.1382, "num_tokens": 239844674.0, "reward": 0.9140625, "reward_std": 0.39317864179611206, "rewards/accuracy_reward/mean": 0.45703125, "rewards/accuracy_reward/std": 0.49863746762275696, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.9140625, "rewards/soft_format_reward/std": 0.28054583072662354, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 7618.0, "completions/max_terminated_length": 7618.0, "completions/mean_length": 835.5546875, "completions/mean_terminated_length": 894.9874267578125, "completions/min_length": 0.0, "completions/min_terminated_length": 255.0, "epoch": 0.9380725540491023, "grad_norm": 0.5430348666668907, "learning_rate": 1e-06, "loss": -0.1157, "num_tokens": 240339150.0, "reward": 0.951171875, "reward_std": 0.36654937267303467, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.5002445578575134, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.93359375, "rewards/soft_format_reward/std": 0.2492343932390213, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 8184.0, "completions/max_terminated_length": 8184.0, "completions/mean_length": 922.828125, "completions/mean_terminated_length": 943.08984375, "completions/min_length": 0.0, "completions/min_terminated_length": 352.0, "epoch": 0.9400268718700379, "grad_norm": 0.6480506259793776, "learning_rate": 1e-06, "loss": -0.0145, "num_tokens": 240874614.0, "reward": 0.9296875, "reward_std": 0.34232279658317566, "rewards/accuracy_reward/mean": 0.44140625, "rewards/accuracy_reward/std": 0.4970405399799347, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.9765625, "rewards/soft_format_reward/std": 0.15143637359142303, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 7655.0, "completions/max_terminated_length": 7655.0, "completions/mean_length": 949.119140625, "completions/mean_terminated_length": 969.9580688476562, "completions/min_length": 0.0, "completions/min_terminated_length": 383.0, "epoch": 0.9419811896909734, "grad_norm": 0.5093417529520246, "learning_rate": 1e-06, "loss": -0.0184, "num_tokens": 241426579.0, "reward": 0.9365234375, "reward_std": 0.3903234004974365, "rewards/accuracy_reward/mean": 0.447265625, "rewards/accuracy_reward/std": 0.4976975917816162, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.978515625, "rewards/soft_format_reward/std": 0.14513419568538666, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 7506.0, "completions/max_terminated_length": 7506.0, "completions/mean_length": 969.109375, "completions/mean_terminated_length": 1004.4210815429688, "completions/min_length": 0.0, "completions/min_terminated_length": 396.0, "epoch": 0.9439355075119091, "grad_norm": 0.6694203730648159, "learning_rate": 1e-06, "loss": -0.0575, "num_tokens": 241986443.0, "reward": 0.9892578125, "reward_std": 0.4110221266746521, "rewards/accuracy_reward/mean": 0.5078125, "rewards/accuracy_reward/std": 0.5004279017448425, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.962890625, "rewards/soft_format_reward/std": 0.18921469151973724, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044921875, "completions/max_length": 5962.0, "completions/max_terminated_length": 5962.0, "completions/mean_length": 784.2421875, "completions/mean_terminated_length": 821.1288452148438, "completions/min_length": 0.0, "completions/min_terminated_length": 326.0, "epoch": 0.9458898253328447, "grad_norm": 0.46927479618168727, "learning_rate": 1e-06, "loss": -0.0903, "num_tokens": 242447735.0, "reward": 1.1123046875, "reward_std": 0.32275721430778503, "rewards/accuracy_reward/mean": 0.63671875, "rewards/accuracy_reward/std": 0.4814152419567108, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.951171875, "rewards/soft_format_reward/std": 0.2157193273305893, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025390625, "completions/max_length": 5921.0, "completions/max_terminated_length": 5921.0, "completions/mean_length": 893.67578125, "completions/mean_terminated_length": 916.9579467773438, "completions/min_length": 0.0, "completions/min_terminated_length": 373.0, "epoch": 0.9478441431537804, "grad_norm": 0.535487662744918, "learning_rate": 1e-06, "loss": -0.0351, "num_tokens": 242969537.0, "reward": 0.857421875, "reward_std": 0.3181568682193756, "rewards/accuracy_reward/mean": 0.37109375, "rewards/accuracy_reward/std": 0.4835699498653412, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.97265625, "rewards/soft_format_reward/std": 0.16324250400066376, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2214.0, "completions/max_terminated_length": 2214.0, "completions/mean_length": 941.484375, "completions/mean_terminated_length": 960.2390747070312, "completions/min_length": 0.0, "completions/min_terminated_length": 381.0, "epoch": 0.949798460974716, "grad_norm": 0.46893495952048375, "learning_rate": 1e-06, "loss": -0.0519, "num_tokens": 243518537.0, "reward": 1.146484375, "reward_std": 0.32323187589645386, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.4754233956336975, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.98046875, "rewards/soft_format_reward/std": 0.1385180652141571, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 7516.0, "completions/max_terminated_length": 7516.0, "completions/mean_length": 965.45703125, "completions/mean_terminated_length": 1008.8040161132812, "completions/min_length": 0.0, "completions/min_terminated_length": 285.0, "epoch": 0.9517527787956517, "grad_norm": 0.6311387421030279, "learning_rate": 1e-06, "loss": -0.0614, "num_tokens": 244086067.0, "reward": 1.11328125, "reward_std": 0.37617602944374084, "rewards/accuracy_reward/mean": 0.634765625, "rewards/accuracy_reward/std": 0.4819667339324951, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.95703125, "rewards/soft_format_reward/std": 0.2029850035905838, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.080078125, "completions/max_length": 7478.0, "completions/max_terminated_length": 7478.0, "completions/mean_length": 803.916015625, "completions/mean_terminated_length": 873.89599609375, "completions/min_length": 0.0, "completions/min_terminated_length": 376.0, "epoch": 0.9537070966165873, "grad_norm": 1.0897314012512074, "learning_rate": 1e-06, "loss": -0.1505, "num_tokens": 244558680.0, "reward": 0.916015625, "reward_std": 0.37017822265625, "rewards/accuracy_reward/mean": 0.4609375, "rewards/accuracy_reward/std": 0.4989593029022217, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.91015625, "rewards/soft_format_reward/std": 0.2862374484539032, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.103515625, "completions/max_length": 7676.0, "completions/max_terminated_length": 7676.0, "completions/mean_length": 804.150390625, "completions/mean_terminated_length": 897.0043334960938, "completions/min_length": 0.0, "completions/min_terminated_length": 346.0, "epoch": 0.955661414437523, "grad_norm": 0.6848232002649514, "learning_rate": 1e-06, "loss": -0.2141, "num_tokens": 245035157.0, "reward": 1.048828125, "reward_std": 0.4050791561603546, "rewards/accuracy_reward/mean": 0.6015625, "rewards/accuracy_reward/std": 0.4900552034378052, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.89453125, "rewards/soft_format_reward/std": 0.3074568510055542, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09765625, "completions/max_length": 6959.0, "completions/max_terminated_length": 6959.0, "completions/mean_length": 925.359375, "completions/mean_terminated_length": 1025.5064697265625, "completions/min_length": 0.0, "completions/min_terminated_length": 422.0, "epoch": 0.9576157322584585, "grad_norm": 0.5541681981586442, "learning_rate": 1e-06, "loss": -0.1795, "num_tokens": 245574269.0, "reward": 0.8955078125, "reward_std": 0.4295736849308014, "rewards/accuracy_reward/mean": 0.447265625, "rewards/accuracy_reward/std": 0.4976975917816162, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.896484375, "rewards/soft_format_reward/std": 0.30492907762527466, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 8133.0, "completions/max_terminated_length": 8133.0, "completions/mean_length": 907.287109375, "completions/mean_terminated_length": 951.90771484375, "completions/min_length": 0.0, "completions/min_terminated_length": 333.0, "epoch": 0.9595700500793941, "grad_norm": 0.4985487593719111, "learning_rate": 1e-06, "loss": -0.0809, "num_tokens": 246104832.0, "reward": 1.01171875, "reward_std": 0.3561217486858368, "rewards/accuracy_reward/mean": 0.541015625, "rewards/accuracy_reward/std": 0.49880221486091614, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.94140625, "rewards/soft_format_reward/std": 0.23509246110916138, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.103515625, "completions/max_length": 7933.0, "completions/max_terminated_length": 7933.0, "completions/mean_length": 795.931640625, "completions/mean_terminated_length": 887.8366088867188, "completions/min_length": 0.0, "completions/min_terminated_length": 344.0, "epoch": 0.9615243679003298, "grad_norm": 0.9344839438192083, "learning_rate": 1e-06, "loss": -0.1672, "num_tokens": 246576221.0, "reward": 0.8955078125, "reward_std": 0.3685035705566406, "rewards/accuracy_reward/mean": 0.44921875, "rewards/accuracy_reward/std": 0.497901052236557, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.892578125, "rewards/soft_format_reward/std": 0.30995169281959534, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.091796875, "completions/max_length": 7176.0, "completions/max_terminated_length": 7176.0, "completions/mean_length": 822.322265625, "completions/mean_terminated_length": 905.438720703125, "completions/min_length": 0.0, "completions/min_terminated_length": 250.0, "epoch": 0.9634786857212654, "grad_norm": 0.5695672312178879, "learning_rate": 1e-06, "loss": -0.1495, "num_tokens": 247062242.0, "reward": 0.9580078125, "reward_std": 0.33883267641067505, "rewards/accuracy_reward/mean": 0.505859375, "rewards/accuracy_reward/std": 0.5004546642303467, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.904296875, "rewards/soft_format_reward/std": 0.2944713830947876, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.048828125, "completions/max_length": 5793.0, "completions/max_terminated_length": 5793.0, "completions/mean_length": 856.931640625, "completions/mean_terminated_length": 900.9219970703125, "completions/min_length": 0.0, "completions/min_terminated_length": 365.0, "epoch": 0.9654330035422011, "grad_norm": 0.4524615129397472, "learning_rate": 1e-06, "loss": -0.0749, "num_tokens": 247560207.0, "reward": 0.9912109375, "reward_std": 0.3110504746437073, "rewards/accuracy_reward/mean": 0.517578125, "rewards/accuracy_reward/std": 0.5001795887947083, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.947265625, "rewards/soft_format_reward/std": 0.22372129559516907, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 6020.0, "completions/max_terminated_length": 6020.0, "completions/mean_length": 733.865234375, "completions/mean_terminated_length": 763.6971435546875, "completions/min_length": 0.0, "completions/min_terminated_length": 301.0, "epoch": 0.9673873213631367, "grad_norm": 0.5168983525550634, "learning_rate": 1e-06, "loss": -0.0714, "num_tokens": 247996186.0, "reward": 1.109375, "reward_std": 0.34613001346588135, "rewards/accuracy_reward/mean": 0.62890625, "rewards/accuracy_reward/std": 0.4835699498653412, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.9609375, "rewards/soft_format_reward/std": 0.1939331740140915, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 7130.0, "completions/max_terminated_length": 7130.0, "completions/mean_length": 857.07421875, "completions/mean_terminated_length": 891.9146118164062, "completions/min_length": 0.0, "completions/min_terminated_length": 353.0, "epoch": 0.9693416391840723, "grad_norm": 0.49569564507491365, "learning_rate": 1e-06, "loss": -0.0839, "num_tokens": 248501696.0, "reward": 1.06640625, "reward_std": 0.3585534989833832, "rewards/accuracy_reward/mean": 0.5859375, "rewards/accuracy_reward/std": 0.49304109811782837, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.9609375, "rewards/soft_format_reward/std": 0.1939331740140915, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 5489.0, "completions/max_terminated_length": 5489.0, "completions/mean_length": 876.552734375, "completions/mean_terminated_length": 894.0139770507812, "completions/min_length": 0.0, "completions/min_terminated_length": 346.0, "epoch": 0.9712959570050079, "grad_norm": 0.6147626643385528, "learning_rate": 1e-06, "loss": -0.0312, "num_tokens": 249010459.0, "reward": 1.0478515625, "reward_std": 0.35256725549697876, "rewards/accuracy_reward/mean": 0.55859375, "rewards/accuracy_reward/std": 0.4970405399799347, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.978515625, "rewards/soft_format_reward/std": 0.14513419568538666, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 6724.0, "completions/max_terminated_length": 6724.0, "completions/mean_length": 920.267578125, "completions/mean_terminated_length": 949.95361328125, "completions/min_length": 0.0, "completions/min_terminated_length": 342.0, "epoch": 0.9732502748259436, "grad_norm": 0.5261575537054536, "learning_rate": 1e-06, "loss": -0.0535, "num_tokens": 249542932.0, "reward": 0.890625, "reward_std": 0.352202832698822, "rewards/accuracy_reward/mean": 0.408203125, "rewards/accuracy_reward/std": 0.49198177456855774, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.96484375, "rewards/soft_format_reward/std": 0.1843547374010086, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.064453125, "completions/max_length": 7714.0, "completions/max_terminated_length": 7714.0, "completions/mean_length": 903.548828125, "completions/mean_terminated_length": 965.7975463867188, "completions/min_length": 0.0, "completions/min_terminated_length": 307.0, "epoch": 0.9752045926468792, "grad_norm": 0.4186029173907592, "learning_rate": 1e-06, "loss": -0.1213, "num_tokens": 250068413.0, "reward": 0.9453125, "reward_std": 0.31097647547721863, "rewards/accuracy_reward/mean": 0.478515625, "rewards/accuracy_reward/std": 0.5000267624855042, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.93359375, "rewards/soft_format_reward/std": 0.2492343932390213, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 8131.0, "completions/max_terminated_length": 8131.0, "completions/mean_length": 836.07421875, "completions/mean_terminated_length": 880.8024291992188, "completions/min_length": 0.0, "completions/min_terminated_length": 262.0, "epoch": 0.9771589104678148, "grad_norm": 0.46700975891340746, "learning_rate": 1e-06, "loss": -0.0894, "num_tokens": 250565155.0, "reward": 0.990234375, "reward_std": 0.3769418001174927, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.5002445578575134, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.94921875, "rewards/soft_format_reward/std": 0.21976542472839355, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.052734375, "completions/max_length": 7985.0, "completions/max_terminated_length": 7985.0, "completions/mean_length": 798.17578125, "completions/mean_terminated_length": 842.6103515625, "completions/min_length": 0.0, "completions/min_terminated_length": 257.0, "epoch": 0.9791132282887505, "grad_norm": 0.5474339883422088, "learning_rate": 1e-06, "loss": -0.0925, "num_tokens": 251042013.0, "reward": 0.98828125, "reward_std": 0.29229220747947693, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.5002445578575134, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.9453125, "rewards/soft_format_reward/std": 0.2275916188955307, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 8155.0, "completions/max_terminated_length": 8155.0, "completions/mean_length": 842.6640625, "completions/mean_terminated_length": 895.112060546875, "completions/min_length": 0.0, "completions/min_terminated_length": 260.0, "epoch": 0.981067546109686, "grad_norm": 0.5621193326438604, "learning_rate": 1e-06, "loss": -0.1066, "num_tokens": 251542577.0, "reward": 1.01171875, "reward_std": 0.37604188919067383, "rewards/accuracy_reward/mean": 0.541015625, "rewards/accuracy_reward/std": 0.49880221486091614, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.94140625, "rewards/soft_format_reward/std": 0.23509246110916138, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 5010.0, "completions/max_terminated_length": 5010.0, "completions/mean_length": 668.14453125, "completions/mean_terminated_length": 688.309814453125, "completions/min_length": 0.0, "completions/min_terminated_length": 310.0, "epoch": 0.9830218639306217, "grad_norm": 0.5498844223977205, "learning_rate": 1e-06, "loss": -0.0264, "num_tokens": 251941115.0, "reward": 1.1494140625, "reward_std": 0.3015892505645752, "rewards/accuracy_reward/mean": 0.6640625, "rewards/accuracy_reward/std": 0.4727790653705597, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.970703125, "rewards/soft_format_reward/std": 0.16880230605602264, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 4417.0, "completions/max_terminated_length": 4417.0, "completions/mean_length": 627.01171875, "completions/mean_terminated_length": 638.2305908203125, "completions/min_length": 0.0, "completions/min_terminated_length": 192.0, "epoch": 0.9849761817515573, "grad_norm": 0.6223520678937557, "learning_rate": 1e-06, "loss": -0.0194, "num_tokens": 252324625.0, "reward": 1.12890625, "reward_std": 0.27855873107910156, "rewards/accuracy_reward/mean": 0.638671875, "rewards/accuracy_reward/std": 0.48085519671440125, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.98046875, "rewards/soft_format_reward/std": 0.1385180652141571, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025390625, "completions/max_length": 3569.0, "completions/max_terminated_length": 3569.0, "completions/mean_length": 774.861328125, "completions/mean_terminated_length": 795.048095703125, "completions/min_length": 0.0, "completions/min_terminated_length": 294.0, "epoch": 0.986930499572493, "grad_norm": 0.528956451291311, "learning_rate": 1e-06, "loss": -0.0397, "num_tokens": 252782186.0, "reward": 1.0947265625, "reward_std": 0.32669395208358765, "rewards/accuracy_reward/mean": 0.607421875, "rewards/accuracy_reward/std": 0.4888018071651459, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.974609375, "rewards/soft_format_reward/std": 0.15746226906776428, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 6655.0, "completions/max_terminated_length": 6655.0, "completions/mean_length": 857.521484375, "completions/mean_terminated_length": 886.9717407226562, "completions/min_length": 0.0, "completions/min_terminated_length": 391.0, "epoch": 0.9888848173934286, "grad_norm": 0.5300826491459741, "learning_rate": 1e-06, "loss": -0.0529, "num_tokens": 253287717.0, "reward": 1.1240234375, "reward_std": 0.35081547498703003, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.48028653860092163, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.966796875, "rewards/soft_format_reward/std": 0.17934183776378632, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 6549.0, "completions/max_terminated_length": 6549.0, "completions/mean_length": 802.947265625, "completions/mean_terminated_length": 832.2044677734375, "completions/min_length": 0.0, "completions/min_terminated_length": 360.0, "epoch": 0.9908391352143643, "grad_norm": 0.42853276451266314, "learning_rate": 1e-06, "loss": -0.0491, "num_tokens": 253759706.0, "reward": 1.1533203125, "reward_std": 0.2587732672691345, "rewards/accuracy_reward/mean": 0.671875, "rewards/accuracy_reward/std": 0.4699897766113281, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.962890625, "rewards/soft_format_reward/std": 0.18921469151973724, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 8005.0, "completions/max_terminated_length": 8005.0, "completions/mean_length": 748.52734375, "completions/mean_terminated_length": 766.4920654296875, "completions/min_length": 0.0, "completions/min_terminated_length": 364.0, "epoch": 0.9927934530352999, "grad_norm": 0.5564003388206429, "learning_rate": 1e-06, "loss": -0.0374, "num_tokens": 254204776.0, "reward": 1.0625, "reward_std": 0.3248573839664459, "rewards/accuracy_reward/mean": 0.57421875, "rewards/accuracy_reward/std": 0.4949444830417633, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.9765625, "rewards/soft_format_reward/std": 0.15143637359142303, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 7709.0, "completions/max_terminated_length": 7709.0, "completions/mean_length": 776.3515625, "completions/mean_terminated_length": 784.0078735351562, "completions/min_length": 0.0, "completions/min_terminated_length": 285.0, "epoch": 0.9947477708562354, "grad_norm": 0.508090925111072, "learning_rate": 1e-06, "loss": -0.0219, "num_tokens": 254664380.0, "reward": 1.1435546875, "reward_std": 0.28476428985595703, "rewards/accuracy_reward/mean": 0.6484375, "rewards/accuracy_reward/std": 0.4779251217842102, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.990234375, "rewards/soft_format_reward/std": 0.09843364357948303, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 8036.0, "completions/max_terminated_length": 8036.0, "completions/mean_length": 699.50390625, "completions/mean_terminated_length": 720.6156616210938, "completions/min_length": 0.0, "completions/min_terminated_length": 254.0, "epoch": 0.9967020886771711, "grad_norm": 0.5967369543635935, "learning_rate": 1e-06, "loss": -0.0371, "num_tokens": 255087710.0, "reward": 1.1630859375, "reward_std": 0.27583375573158264, "rewards/accuracy_reward/mean": 0.677734375, "rewards/accuracy_reward/std": 0.46780112385749817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.970703125, "rewards/soft_format_reward/std": 0.16880230605602264, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.019920318725099584, "completions/max_length": 7975.0, "completions/max_terminated_length": 7975.0, "completions/mean_length": 738.486083984375, "completions/mean_terminated_length": 753.4959106445312, "completions/min_length": 0.0, "completions/min_terminated_length": 266.0, "epoch": 0.9986564064981067, "grad_norm": 0.7423001685047017, "learning_rate": 1e-06, "loss": -0.0269, "num_tokens": 255530192.0, "reward": 1.150390625, "reward_std": 0.3285100758075714, "rewards/accuracy_reward/mean": 0.66015625, "rewards/accuracy_reward/std": 0.4741191864013672, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/soft_format_reward/mean": 0.98046875, "rewards/soft_format_reward/std": 0.1385180652141571, "step": 511 }, { "epoch": 0.9986564064981067, "step": 511, "total_flos": 0.0, "train_loss": -0.00574838798383794, "train_runtime": 41865.0435, "train_samples_per_second": 0.391, "train_steps_per_second": 0.012 } ], "logging_steps": 1, "max_steps": 511, "num_input_tokens_seen": 255530192, "num_train_epochs": 1, "save_steps": 52, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }