{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8952551477170994, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1792.0, "completions/mean_length": 493.005859375, "completions/mean_terminated_length": 483.84088134765625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.0017905102954341987, "frac_reward_zero_std": 0.390625, "grad_norm": 0.2757502589429789, "kl": 0.0003304481506347656, "learning_rate": 0.0, "loss": 0.0272, "num_tokens": 557379.0, "reward": 0.05250054970383644, "reward_std": 0.08277664333581924, "rewards/code_reward/mean": 0.0478130541741848, "rewards/code_reward/std": 0.1902095377445221, "rewards/format_reward/mean": 0.046875, "rewards/format_reward/std": 0.21157780289649963, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1786.0, "completions/mean_length": 507.130859375, "completions/mean_terminated_length": 504.1154479980469, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.0035810205908683975, "frac_reward_zero_std": 0.296875, "grad_norm": 0.2732568020642173, "kl": 0.00031757354736328125, "learning_rate": 3.3333333333333335e-07, "loss": 0.04, "num_tokens": 1129766.0, "reward": 0.06429892778396606, "reward_std": 0.10656341910362244, "rewards/code_reward/mean": 0.06058799847960472, "rewards/code_reward/std": 0.18849444389343262, "rewards/format_reward/mean": 0.037109375, "rewards/format_reward/std": 0.18921469151973724, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1906.0, "completions/max_terminated_length": 1906.0, "completions/mean_length": 492.716796875, "completions/mean_terminated_length": 492.716796875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.005371530886302597, "frac_reward_zero_std": 0.3125, "grad_norm": 0.27727370042518834, "kl": 0.0003261566162109375, "learning_rate": 6.666666666666667e-07, "loss": 0.0197, "num_tokens": 1691901.0, "reward": 0.08028307557106018, "reward_std": 0.10556691884994507, "rewards/code_reward/mean": 0.0748143196105957, "rewards/code_reward/std": 0.22891442477703094, "rewards/format_reward/mean": 0.0546875, "rewards/format_reward/std": 0.2275916188955307, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1877.0, "completions/mean_length": 504.494140625, "completions/mean_terminated_length": 500.3745422363281, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.007162041181736795, "frac_reward_zero_std": 0.390625, "grad_norm": 3.947992067205153, "kl": 0.00048732757568359375, "learning_rate": 1.0000000000000002e-06, "loss": 0.0381, "num_tokens": 2269250.0, "reward": 0.061902254819869995, "reward_std": 0.0871427059173584, "rewards/code_reward/mean": 0.057996008545160294, "rewards/code_reward/std": 0.19543209671974182, "rewards/format_reward/mean": 0.0390625, "rewards/format_reward/std": 0.1939331740140915, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 504.240234375, "completions/mean_terminated_length": 501.21917724609375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.008952551477170993, "frac_reward_zero_std": 0.328125, "grad_norm": 0.28881266097079156, "kl": 0.00033283233642578125, "learning_rate": 1.3333333333333334e-06, "loss": 0.0403, "num_tokens": 2826525.0, "reward": 0.06446345895528793, "reward_std": 0.09759357571601868, "rewards/code_reward/mean": 0.05899471044540405, "rewards/code_reward/std": 0.19848643243312836, "rewards/format_reward/mean": 0.0546875, "rewards/format_reward/std": 0.2275916188955307, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1775.0, "completions/mean_length": 511.806640625, "completions/mean_terminated_length": 496.65679931640625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.010743061772605193, "frac_reward_zero_std": 0.265625, "grad_norm": 0.28479707790146735, "kl": 0.0003528594970703125, "learning_rate": 1.6666666666666667e-06, "loss": 0.019, "num_tokens": 3400586.0, "reward": 0.06877022236585617, "reward_std": 0.10614904016256332, "rewards/code_reward/mean": 0.06388740986585617, "rewards/code_reward/std": 0.2069348692893982, "rewards/format_reward/mean": 0.048828125, "rewards/format_reward/std": 0.2157193273305893, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1950.0, "completions/mean_length": 491.537109375, "completions/mean_terminated_length": 485.433349609375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.012533572068039392, "frac_reward_zero_std": 0.3125, "grad_norm": 0.29706076514094143, "kl": 0.0005025863647460938, "learning_rate": 2.0000000000000003e-06, "loss": 0.0191, "num_tokens": 3987997.0, "reward": 0.0673324316740036, "reward_std": 0.07473643124103546, "rewards/code_reward/mean": 0.059129297733306885, "rewards/code_reward/std": 0.2097090482711792, "rewards/format_reward/mean": 0.08203125, "rewards/format_reward/std": 0.2746807038784027, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1632.0, "completions/mean_length": 486.7734375, "completions/mean_terminated_length": 478.59136962890625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.01432408236347359, "frac_reward_zero_std": 0.234375, "grad_norm": 0.3870343441661702, "kl": 0.0040874481201171875, "learning_rate": 2.3333333333333336e-06, "loss": 0.0443, "num_tokens": 4547745.0, "reward": 0.056895818561315536, "reward_std": 0.08351406455039978, "rewards/code_reward/mean": 0.04361456632614136, "rewards/code_reward/std": 0.16214141249656677, "rewards/format_reward/mean": 0.1328125, "rewards/format_reward/std": 0.33970388770103455, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1698.0, "completions/max_terminated_length": 1698.0, "completions/mean_length": 412.912109375, "completions/mean_terminated_length": 411.7651672363281, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.01611459265890779, "frac_reward_zero_std": 0.125, "grad_norm": 176.7439540340013, "kl": 2.2675018310546875, "learning_rate": 2.666666666666667e-06, "loss": 0.0536, "num_tokens": 5044092.0, "reward": 0.09792006015777588, "reward_std": 0.12002617865800858, "rewards/code_reward/mean": 0.07174818962812424, "rewards/code_reward/std": 0.2231462150812149, "rewards/format_reward/mean": 0.26171875, "rewards/format_reward/std": 0.44000017642974854, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1939.0, "completions/max_terminated_length": 1939.0, "completions/mean_length": 420.681640625, "completions/mean_terminated_length": 420.681640625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.017905102954341987, "frac_reward_zero_std": 0.0, "grad_norm": 0.41863024645003977, "kl": 0.004291534423828125, "learning_rate": 3e-06, "loss": 0.0163, "num_tokens": 5550769.0, "reward": 0.1363074779510498, "reward_std": 0.1268329918384552, "rewards/code_reward/mean": 0.09158090502023697, "rewards/code_reward/std": 0.2484017312526703, "rewards/format_reward/mean": 0.447265625, "rewards/format_reward/std": 0.4976975917816162, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1772.0, "completions/mean_length": 424.7734375, "completions/mean_terminated_length": 415.206298828125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.019695613249776187, "frac_reward_zero_std": 0.03125, "grad_norm": 11.823889399121583, "kl": 0.010284423828125, "learning_rate": 3.3333333333333333e-06, "loss": 0.0444, "num_tokens": 6082733.0, "reward": 0.12078794836997986, "reward_std": 0.10887586325407028, "rewards/code_reward/mean": 0.05360043793916702, "rewards/code_reward/std": 0.1981125921010971, "rewards/format_reward/mean": 0.671875, "rewards/format_reward/std": 0.4699897766113281, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 418.8515625, "completions/mean_terminated_length": 412.4627685546875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.021486123545210387, "frac_reward_zero_std": 0.03125, "grad_norm": 0.396644196267335, "kl": 0.00769805908203125, "learning_rate": 3.6666666666666666e-06, "loss": 0.0586, "num_tokens": 6613265.0, "reward": 0.19388511776924133, "reward_std": 0.12799829244613647, "rewards/code_reward/mean": 0.11556479334831238, "rewards/code_reward/std": 0.28778785467147827, "rewards/format_reward/mean": 0.783203125, "rewards/format_reward/std": 0.4124660789966583, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 438.08984375, "completions/mean_terminated_length": 428.6011962890625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.023276633840644583, "frac_reward_zero_std": 0.21875, "grad_norm": 0.33279910493948966, "kl": 0.010040283203125, "learning_rate": 4.000000000000001e-06, "loss": 0.0382, "num_tokens": 7148495.0, "reward": 0.12756900489330292, "reward_std": 0.08163600414991379, "rewards/code_reward/mean": 0.03811588138341904, "rewards/code_reward/std": 0.15678994357585907, "rewards/format_reward/mean": 0.89453125, "rewards/format_reward/std": 0.3074568510055542, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1120.0, "completions/mean_length": 394.23828125, "completions/mean_terminated_length": 391.001953125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.025067144136078783, "frac_reward_zero_std": 0.234375, "grad_norm": 0.3856134027803413, "kl": 0.022613525390625, "learning_rate": 4.333333333333334e-06, "loss": 0.0412, "num_tokens": 7650905.0, "reward": 0.1375100314617157, "reward_std": 0.08877082914113998, "rewards/code_reward/mean": 0.04200221225619316, "rewards/code_reward/std": 0.1620388776063919, "rewards/format_reward/mean": 0.955078125, "rewards/format_reward/std": 0.20733514428138733, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1564.0, "completions/mean_length": 392.244140625, "completions/mean_terminated_length": 389.00390625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.02685765443151298, "frac_reward_zero_std": 0.296875, "grad_norm": 0.3547185224581231, "kl": 0.016448974609375, "learning_rate": 4.666666666666667e-06, "loss": 0.0194, "num_tokens": 8160318.0, "reward": 0.19258737564086914, "reward_std": 0.1283767819404602, "rewards/code_reward/mean": 0.09590768814086914, "rewards/code_reward/std": 0.2513069212436676, "rewards/format_reward/mean": 0.966796875, "rewards/format_reward/std": 0.17934183776378632, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1508.0, "completions/max_terminated_length": 1508.0, "completions/mean_length": 388.904296875, "completions/mean_terminated_length": 388.904296875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.02864816472694718, "frac_reward_zero_std": 0.390625, "grad_norm": 0.3084273486335259, "kl": 0.0166473388671875, "learning_rate": 5e-06, "loss": 0.0115, "num_tokens": 8676933.0, "reward": 0.1948840171098709, "reward_std": 0.10760986804962158, "rewards/code_reward/mean": 0.09703244268894196, "rewards/code_reward/std": 0.2438713163137436, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1508.0, "completions/mean_length": 372.7578125, "completions/mean_terminated_length": 369.4794616699219, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.03043867502238138, "frac_reward_zero_std": 0.390625, "grad_norm": 0.2828159402478176, "kl": 0.020294189453125, "learning_rate": 4.999952797253148e-06, "loss": 0.0244, "num_tokens": 9172881.0, "reward": 0.1950989067554474, "reward_std": 0.1255243718624115, "rewards/code_reward/mean": 0.09568483382463455, "rewards/code_reward/std": 0.2350296527147293, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1719.0, "completions/max_terminated_length": 1132.0, "completions/mean_length": 386.24609375, "completions/mean_terminated_length": 383.6379699707031, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.03222918531781558, "frac_reward_zero_std": 0.4375, "grad_norm": 0.2675854282504111, "kl": 0.02545166015625, "learning_rate": 4.9998111909931225e-06, "loss": 0.0258, "num_tokens": 9647799.0, "reward": 0.23356594145298004, "reward_std": 0.0985180139541626, "rewards/code_reward/mean": 0.13395656645298004, "rewards/code_reward/std": 0.29350709915161133, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1311.0, "completions/max_terminated_length": 1311.0, "completions/mean_length": 401.91796875, "completions/mean_terminated_length": 401.91796875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.03401969561324977, "frac_reward_zero_std": 0.375, "grad_norm": 0.28888715014007377, "kl": 0.032562255859375, "learning_rate": 4.999575187161439e-06, "loss": 0.0091, "num_tokens": 10158533.0, "reward": 0.18839946389198303, "reward_std": 0.08753000944852829, "rewards/code_reward/mean": 0.08859477937221527, "rewards/code_reward/std": 0.21908476948738098, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1211.0, "completions/max_terminated_length": 1211.0, "completions/mean_length": 393.80078125, "completions/mean_terminated_length": 393.80078125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.03581020590868397, "frac_reward_zero_std": 0.28125, "grad_norm": 0.33070378533105166, "kl": 0.04669189453125, "learning_rate": 4.9992447956603455e-06, "loss": 0.0256, "num_tokens": 10665511.0, "reward": 0.2654585838317871, "reward_std": 0.13921421766281128, "rewards/code_reward/mean": 0.16584917902946472, "rewards/code_reward/std": 0.3116007149219513, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1068.0, "completions/mean_length": 361.251953125, "completions/mean_terminated_length": 357.9510803222656, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.03760071620411817, "frac_reward_zero_std": 0.34375, "grad_norm": 0.597794591427295, "kl": 0.08099365234375, "learning_rate": 4.998820030352409e-06, "loss": 0.0337, "num_tokens": 11137000.0, "reward": 0.22268709540367126, "reward_std": 0.10968157649040222, "rewards/code_reward/mean": 0.12288239598274231, "rewards/code_reward/std": 0.2778187692165375, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 736.0, "completions/max_terminated_length": 736.0, "completions/mean_length": 311.146484375, "completions/mean_terminated_length": 311.146484375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.03939122649955237, "frac_reward_zero_std": 0.34375, "grad_norm": 0.317740537031204, "kl": 0.0445556640625, "learning_rate": 4.998300909059929e-06, "loss": 0.0275, "num_tokens": 11633403.0, "reward": 0.2248891144990921, "reward_std": 0.11126409471035004, "rewards/code_reward/mean": 0.12547504901885986, "rewards/code_reward/std": 0.2648424804210663, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 780.0, "completions/max_terminated_length": 780.0, "completions/mean_length": 277.6875, "completions/mean_terminated_length": 276.9510803222656, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.04118173679498657, "frac_reward_zero_std": 0.28125, "grad_norm": 26292.62749312279, "kl": 478.03466796875, "learning_rate": 4.997687453564198e-06, "loss": 4.8135, "num_tokens": 12085971.0, "reward": 0.2185674011707306, "reward_std": 0.11440497636795044, "rewards/code_reward/mean": 0.11934866011142731, "rewards/code_reward/std": 0.25521907210350037, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 281.314453125, "completions/mean_terminated_length": 277.48236083984375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.04297224709042077, "frac_reward_zero_std": 0.375, "grad_norm": 77.89418471718855, "kl": 1.48876953125, "learning_rate": 4.9969796896045775e-06, "loss": 0.0472, "num_tokens": 12548356.0, "reward": 0.2309560477733612, "reward_std": 0.11367710679769516, "rewards/code_reward/mean": 0.13134664297103882, "rewards/code_reward/std": 0.2928767502307892, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 1482.0, "completions/max_terminated_length": 1482.0, "completions/mean_length": 269.021484375, "completions/mean_terminated_length": 263.2224426269531, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.044762757385854966, "frac_reward_zero_std": 0.328125, "grad_norm": 1287.435698628336, "kl": 30.04058837890625, "learning_rate": 4.996177646877426e-06, "loss": 0.3531, "num_tokens": 12972551.0, "reward": 0.2321074903011322, "reward_std": 0.07545800507068634, "rewards/code_reward/mean": 0.13347464799880981, "rewards/code_reward/std": 0.27745237946510315, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1449.0, "completions/mean_length": 275.6171875, "completions/mean_terminated_length": 271.76080322265625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.046553267681289166, "frac_reward_zero_std": 0.28125, "grad_norm": 4.9267677052405325, "kl": 0.07440185546875, "learning_rate": 4.995281359034851e-06, "loss": 0.0433, "num_tokens": 13423307.0, "reward": 0.20034222304821014, "reward_std": 0.10246343910694122, "rewards/code_reward/mean": 0.10112347453832626, "rewards/code_reward/std": 0.2486242651939392, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.875, "completions/max_length": 845.0, "completions/max_terminated_length": 845.0, "completions/mean_length": 264.79296875, "completions/mean_terminated_length": 260.7658996582031, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.048343777976723366, "frac_reward_zero_std": 0.40625, "grad_norm": 7966.125450723681, "kl": 260.125, "learning_rate": 4.994290863683296e-06, "loss": 2.6186, "num_tokens": 13867081.0, "reward": 0.18631170690059662, "reward_std": 0.08250073343515396, "rewards/code_reward/mean": 0.0882648229598999, "rewards/code_reward/std": 0.2319841980934143, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.78125, "completions/max_length": 1204.0, "completions/max_terminated_length": 1204.0, "completions/mean_length": 273.697265625, "completions/mean_terminated_length": 264.2048034667969, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.050134288272157566, "frac_reward_zero_std": 0.234375, "grad_norm": 51.89013340529153, "kl": 3.744140625, "learning_rate": 4.99320620238196e-06, "loss": 0.083, "num_tokens": 14322838.0, "reward": 0.2373797446489334, "reward_std": 0.13140062987804413, "rewards/code_reward/mean": 0.1403094232082367, "rewards/code_reward/std": 0.2761482894420624, "rewards/format_reward/mean": 0.970703125, "rewards/format_reward/std": 0.16880230605602264, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.75, "completions/max_length": 1734.0, "completions/max_terminated_length": 1734.0, "completions/mean_length": 264.224609375, "completions/mean_terminated_length": 252.68345642089844, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.051924798567591766, "frac_reward_zero_std": 0.1875, "grad_norm": 2358.3856214808316, "kl": 82.66650390625, "learning_rate": 4.99202742064106e-06, "loss": 0.8776, "num_tokens": 14752641.0, "reward": 0.27542197704315186, "reward_std": 0.16001757979393005, "rewards/code_reward/mean": 0.1785469949245453, "rewards/code_reward/std": 0.3124798536300659, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17416280508041382, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.8125, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 243.6171875, "completions/mean_terminated_length": 237.58401489257812, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.05371530886302596, "frac_reward_zero_std": 0.234375, "grad_norm": 28.250674683465512, "kl": 0.244140625, "learning_rate": 4.990754567919917e-06, "loss": 0.0554, "num_tokens": 15190213.0, "reward": 0.26493605971336365, "reward_std": 0.1365211308002472, "rewards/code_reward/mean": 0.1674751192331314, "rewards/code_reward/std": 0.32023707032203674, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.796875, "completions/max_length": 976.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 245.755859375, "completions/mean_terminated_length": 237.15029907226562, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.05550581915846016, "frac_reward_zero_std": 0.234375, "grad_norm": 40.263633758336184, "kl": 6.42578125, "learning_rate": 4.989387697624881e-06, "loss": 0.1155, "num_tokens": 15615888.0, "reward": 0.23872455954551697, "reward_std": 0.12384220957756042, "rewards/code_reward/mean": 0.1414589285850525, "rewards/code_reward/std": 0.2653982639312744, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.16324250400066376, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.578125, "completions/max_length": 647.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 234.1640625, "completions/mean_terminated_length": 217.7814483642578, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.05729632945389436, "frac_reward_zero_std": 0.265625, "grad_norm": 844.1044658485268, "kl": 27.8515625, "learning_rate": 4.987926867107095e-06, "loss": 0.3386, "num_tokens": 16022164.0, "reward": 0.262678861618042, "reward_std": 0.148442342877388, "rewards/code_reward/mean": 0.16814762353897095, "rewards/code_reward/std": 0.3275230824947357, "rewards/format_reward/mean": 0.9453125, "rewards/format_reward/std": 0.2275916188955307, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.515625, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 237.966796875, "completions/mean_terminated_length": 224.6881561279297, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.05908683974932856, "frac_reward_zero_std": 0.15625, "grad_norm": 101.24257125417408, "kl": 9.671875, "learning_rate": 4.986372137660078e-06, "loss": 0.1552, "num_tokens": 16441243.0, "reward": 0.23393931984901428, "reward_std": 0.14389869570732117, "rewards/code_reward/mean": 0.1405799239873886, "rewards/code_reward/std": 0.2560943365097046, "rewards/format_reward/mean": 0.93359375, "rewards/format_reward/std": 0.2492343932390213, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.59375, "completions/max_length": 699.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 226.853515625, "completions/mean_terminated_length": 214.0987548828125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.06087735004476276, "frac_reward_zero_std": 0.234375, "grad_norm": 45.38369289490912, "kl": 0.3125, "learning_rate": 4.984723574517165e-06, "loss": 0.0716, "num_tokens": 16867304.0, "reward": 0.24111203849315643, "reward_std": 0.10642847418785095, "rewards/code_reward/mean": 0.14677609503269196, "rewards/code_reward/std": 0.28116148710250854, "rewards/format_reward/mean": 0.943359375, "rewards/format_reward/std": 0.23138070106506348, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.609375, "completions/max_length": 944.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 228.361328125, "completions/mean_terminated_length": 216.56263732910156, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.06266786034019696, "frac_reward_zero_std": 0.28125, "grad_norm": 35.34863632340071, "kl": 1.19091796875, "learning_rate": 4.9829812468487655e-06, "loss": 0.0761, "num_tokens": 17304601.0, "reward": 0.26075777411460876, "reward_std": 0.12714025378227234, "rewards/code_reward/mean": 0.16603121161460876, "rewards/code_reward/std": 0.27683666348457336, "rewards/format_reward/mean": 0.947265625, "rewards/format_reward/std": 0.22372129559516907, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.6875, "completions/max_length": 965.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 221.921875, "completions/mean_terminated_length": 214.88211059570312, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.06445837063563116, "frac_reward_zero_std": 0.28125, "grad_norm": 1613.9001589276616, "kl": 103.71875, "learning_rate": 4.981145227759457e-06, "loss": 1.0771, "num_tokens": 17734281.0, "reward": 0.2762853503227234, "reward_std": 0.12064392119646072, "rewards/code_reward/mean": 0.18019157648086548, "rewards/code_reward/std": 0.3396458625793457, "rewards/format_reward/mean": 0.9609375, "rewards/format_reward/std": 0.1939331740140915, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 757.0, "completions/mean_length": 257.1875, "completions/mean_terminated_length": 220.2776641845703, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.06624888093106536, "frac_reward_zero_std": 0.28125, "grad_norm": 25628.762422436746, "kl": 1421.0, "learning_rate": 4.979215594284924e-06, "loss": 14.3896, "num_tokens": 18166633.0, "reward": 0.2596889138221741, "reward_std": 0.10138504207134247, "rewards/code_reward/mean": 0.16672013700008392, "rewards/code_reward/std": 0.2989679276943207, "rewards/format_reward/mean": 0.9296875, "rewards/format_reward/std": 0.25592297315597534, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.71875, "completions/max_length": 651.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 225.94921875, "completions/mean_terminated_length": 215.35223388671875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.06803939122649955, "frac_reward_zero_std": 0.203125, "grad_norm": 21.00947022467588, "kl": 4.9140625, "learning_rate": 4.977192427388722e-06, "loss": 0.101, "num_tokens": 18581199.0, "reward": 0.24892207980155945, "reward_std": 0.12581925094127655, "rewards/code_reward/mean": 0.15360957384109497, "rewards/code_reward/std": 0.2981409728527069, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21157780289649963, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.703125, "completions/max_length": 810.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 227.953125, "completions/mean_terminated_length": 215.56591796875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.06982990152193375, "frac_reward_zero_std": 0.265625, "grad_norm": 10.551043694062427, "kl": 2.36328125, "learning_rate": 4.9750758119588824e-06, "loss": 0.0712, "num_tokens": 19002895.0, "reward": 0.23094934225082397, "reward_std": 0.11277034133672714, "rewards/code_reward/mean": 0.13524621725082397, "rewards/code_reward/std": 0.2503395974636078, "rewards/format_reward/mean": 0.95703125, "rewards/format_reward/std": 0.2029850035905838, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.828125, "completions/max_length": 692.0, "completions/max_terminated_length": 692.0, "completions/mean_length": 217.794921875, "completions/mean_terminated_length": 212.34132385253906, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.07162041181736795, "frac_reward_zero_std": 0.3125, "grad_norm": 279.487355262755, "kl": 22.897705078125, "learning_rate": 4.972865836804349e-06, "loss": 0.2806, "num_tokens": 19419646.0, "reward": 0.2884724736213684, "reward_std": 0.1347290575504303, "rewards/code_reward/mean": 0.19081620872020721, "rewards/code_reward/std": 0.3314821720123291, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.765625, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 229.990234375, "completions/mean_terminated_length": 223.26156616210938, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.07341092211280215, "frac_reward_zero_std": 0.296875, "grad_norm": 28.481042919503803, "kl": 5.65625, "learning_rate": 4.970562594651254e-06, "loss": 0.1083, "num_tokens": 19857393.0, "reward": 0.21748042106628418, "reward_std": 0.1042231023311615, "rewards/code_reward/mean": 0.12080072611570358, "rewards/code_reward/std": 0.25313013792037964, "rewards/format_reward/mean": 0.966796875, "rewards/format_reward/std": 0.17934183776378632, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.859375, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 211.607421875, "completions/mean_terminated_length": 207.25247192382812, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.07520143240823635, "frac_reward_zero_std": 0.265625, "grad_norm": 55.13506117343325, "kl": 5.0546875, "learning_rate": 4.968166182139026e-06, "loss": 0.0863, "num_tokens": 20269136.0, "reward": 0.23401585221290588, "reward_std": 0.12024495005607605, "rewards/code_reward/mean": 0.1367502212524414, "rewards/code_reward/std": 0.2655896246433258, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.16324250400066376, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.859375, "completions/max_length": 903.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 222.115234375, "completions/mean_terminated_length": 214.9005889892578, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.07699194270367055, "frac_reward_zero_std": 0.3125, "grad_norm": 21.97078017929822, "kl": 3.167236328125, "learning_rate": 4.9656766998163306e-06, "loss": 0.0617, "num_tokens": 20729611.0, "reward": 0.2545284330844879, "reward_std": 0.13048286736011505, "rewards/code_reward/mean": 0.15687218308448792, "rewards/code_reward/std": 0.2989810109138489, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.890625, "completions/max_length": 1287.0, "completions/max_terminated_length": 1287.0, "completions/mean_length": 222.666015625, "completions/mean_terminated_length": 214.33663940429688, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.07878245299910475, "frac_reward_zero_std": 0.25, "grad_norm": 18.543874285215562, "kl": 1.758056640625, "learning_rate": 4.963094252136865e-06, "loss": 0.0565, "num_tokens": 21151824.0, "reward": 0.24118606746196747, "reward_std": 0.11667320132255554, "rewards/code_reward/mean": 0.14274856448173523, "rewards/code_reward/std": 0.27080079913139343, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.859375, "completions/max_length": 746.0, "completions/max_terminated_length": 746.0, "completions/mean_length": 226.15625, "completions/mean_terminated_length": 220.52484130859375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.08057296329453895, "frac_reward_zero_std": 0.28125, "grad_norm": 10.634917590790273, "kl": 0.47216796875, "learning_rate": 4.960418947454958e-06, "loss": 0.0423, "num_tokens": 21561752.0, "reward": 0.24755977094173431, "reward_std": 0.12999823689460754, "rewards/code_reward/mean": 0.15107539296150208, "rewards/code_reward/std": 0.3009493350982666, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.1843547374010086, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.921875, "completions/max_length": 1311.0, "completions/max_terminated_length": 1311.0, "completions/mean_length": 215.79296875, "completions/mean_terminated_length": 212.1676483154297, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.08236347358997315, "frac_reward_zero_std": 0.21875, "grad_norm": 3.6985996093734346, "kl": 0.9036865234375, "learning_rate": 4.957650898021038e-06, "loss": 0.04, "num_tokens": 21975942.0, "reward": 0.2695898115634918, "reward_std": 0.1318770796060562, "rewards/code_reward/mean": 0.1721288561820984, "rewards/code_reward/std": 0.29333585500717163, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.921875, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 211.095703125, "completions/mean_terminated_length": 208.2800750732422, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.08415398388540735, "frac_reward_zero_std": 0.28125, "grad_norm": 167.5242469207357, "kl": 21.169921875, "learning_rate": 4.954790219976915e-06, "loss": 0.2446, "num_tokens": 22381079.0, "reward": 0.30382397770881653, "reward_std": 0.14238783717155457, "rewards/code_reward/mean": 0.2057770937681198, "rewards/code_reward/std": 0.3296486437320709, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 1866.0, "completions/max_terminated_length": 1866.0, "completions/mean_length": 230.26171875, "completions/mean_terminated_length": 226.5481414794922, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.08594449418084155, "frac_reward_zero_std": 0.234375, "grad_norm": 18.113411673403668, "kl": 0.6981201171875, "learning_rate": 4.95183703335091e-06, "loss": 0.0593, "num_tokens": 22801453.0, "reward": 0.2330046147108078, "reward_std": 0.11765988171100616, "rewards/code_reward/mean": 0.13456711173057556, "rewards/code_reward/std": 0.2865093946456909, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 731.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 219.099609375, "completions/mean_terminated_length": 217.9784393310547, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.08773500447627573, "frac_reward_zero_std": 0.28125, "grad_norm": 6.281449823326317, "kl": 0.9593505859375, "learning_rate": 4.948791462052819e-06, "loss": 0.0312, "num_tokens": 23219672.0, "reward": 0.2044486403465271, "reward_std": 0.10293813794851303, "rewards/code_reward/mean": 0.10698768496513367, "rewards/code_reward/std": 0.2525589168071747, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 206.75, "completions/mean_terminated_length": 206.75, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.08952551477170993, "frac_reward_zero_std": 0.28125, "grad_norm": 0.3635810265515654, "kl": 0.0946044921875, "learning_rate": 4.945653633868716e-06, "loss": 0.0145, "num_tokens": 23632728.0, "reward": 0.27060389518737793, "reward_std": 0.1412232220172882, "rewards/code_reward/mean": 0.17294764518737793, "rewards/code_reward/std": 0.309596985578537, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 204.482421875, "completions/mean_terminated_length": 203.19020080566406, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.09131602506714413, "frac_reward_zero_std": 0.328125, "grad_norm": 112.7611202774303, "kl": 9.71337890625, "learning_rate": 4.942423680455584e-06, "loss": 0.1195, "num_tokens": 24040031.0, "reward": 0.22862176597118378, "reward_std": 0.11090853810310364, "rewards/code_reward/mean": 0.1303795576095581, "rewards/code_reward/std": 0.274039626121521, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 210.65625, "completions/mean_terminated_length": 210.373779296875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.09310653536257833, "frac_reward_zero_std": 0.296875, "grad_norm": 1.4056397891936114, "kl": 0.1268310546875, "learning_rate": 4.939101737335802e-06, "loss": 0.0216, "num_tokens": 24443519.0, "reward": 0.26519978046417236, "reward_std": 0.12825354933738708, "rewards/code_reward/mean": 0.16676226258277893, "rewards/code_reward/std": 0.2940298914909363, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 711.0, "completions/max_terminated_length": 711.0, "completions/mean_length": 199.900390625, "completions/mean_terminated_length": 198.75836181640625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.09489704565801253, "frac_reward_zero_std": 0.1875, "grad_norm": 177.64805473320718, "kl": 12.944091796875, "learning_rate": 4.935687943891447e-06, "loss": 0.1538, "num_tokens": 24838284.0, "reward": 0.34263068437576294, "reward_std": 0.1747278869152069, "rewards/code_reward/mean": 0.2441931813955307, "rewards/code_reward/std": 0.36986133456230164, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 198.400390625, "completions/mean_terminated_length": 197.6516571044922, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.09668755595344673, "frac_reward_zero_std": 0.28125, "grad_norm": 3.770937658531838, "kl": 0.1376953125, "learning_rate": 4.932182443358458e-06, "loss": 0.0288, "num_tokens": 25234097.0, "reward": 0.28923773765563965, "reward_std": 0.11404216289520264, "rewards/code_reward/mean": 0.18982365727424622, "rewards/code_reward/std": 0.3193945288658142, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 738.0, "completions/max_terminated_length": 738.0, "completions/mean_length": 212.220703125, "completions/mean_terminated_length": 211.1917724609375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.09847806624888093, "frac_reward_zero_std": 0.359375, "grad_norm": 204.13929433338043, "kl": 31.0750732421875, "learning_rate": 4.928585382820616e-06, "loss": 0.3253, "num_tokens": 25634306.0, "reward": 0.2951890230178833, "reward_std": 0.12603974342346191, "rewards/code_reward/mean": 0.19636088609695435, "rewards/code_reward/std": 0.32039520144462585, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 209.654296875, "completions/mean_terminated_length": 209.33267211914062, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.10026857654431513, "frac_reward_zero_std": 0.25, "grad_norm": 18.6858339873043, "kl": 3.0994873046875, "learning_rate": 4.924896913203376e-06, "loss": 0.0407, "num_tokens": 26058945.0, "reward": 0.2629796862602234, "reward_std": 0.15527981519699097, "rewards/code_reward/mean": 0.16376091539859772, "rewards/code_reward/std": 0.2910192608833313, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1823.0, "completions/max_terminated_length": 1823.0, "completions/mean_length": 210.56640625, "completions/mean_terminated_length": 210.17808532714844, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.10205908683974933, "frac_reward_zero_std": 0.34375, "grad_norm": 1.0218658230100541, "kl": 0.1678466796875, "learning_rate": 4.921117189267535e-06, "loss": 0.0133, "num_tokens": 26488091.0, "reward": 0.25674957036972046, "reward_std": 0.10721008479595184, "rewards/code_reward/mean": 0.15714019536972046, "rewards/code_reward/std": 0.29161617159843445, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 206.619140625, "completions/mean_terminated_length": 206.619140625, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.10384959713518353, "frac_reward_zero_std": 0.3125, "grad_norm": 0.36980077246982473, "kl": 0.097412109375, "learning_rate": 4.917246369602742e-06, "loss": 0.0027, "num_tokens": 26908096.0, "reward": 0.21150203049182892, "reward_std": 0.11922980844974518, "rewards/code_reward/mean": 0.11189265549182892, "rewards/code_reward/std": 0.2609483003616333, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 627.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 229.169921875, "completions/mean_terminated_length": 228.8160400390625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.10564010743061773, "frac_reward_zero_std": 0.21875, "grad_norm": 0.3859056831060467, "kl": 0.0926513671875, "learning_rate": 4.9132846166208355e-06, "loss": 0.0067, "num_tokens": 27348831.0, "reward": 0.21673211455345154, "reward_std": 0.11785140633583069, "rewards/code_reward/mean": 0.11790399253368378, "rewards/code_reward/std": 0.24238136410713196, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 210.228515625, "completions/mean_terminated_length": 210.228515625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.10743061772605192, "frac_reward_zero_std": 0.25, "grad_norm": 0.40477508430921005, "kl": 0.1046142578125, "learning_rate": 4.9092320965490365e-06, "loss": 0.0165, "num_tokens": 27813036.0, "reward": 0.21685603260993958, "reward_std": 0.12834230065345764, "rewards/code_reward/mean": 0.11744196712970734, "rewards/code_reward/std": 0.2476741075515747, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 596.0, "completions/max_terminated_length": 596.0, "completions/mean_length": 214.443359375, "completions/mean_terminated_length": 214.443359375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.10922112802148612, "frac_reward_zero_std": 0.34375, "grad_norm": 0.43160549023156675, "kl": 0.10009765625, "learning_rate": 4.905088979422971e-06, "loss": 0.0115, "num_tokens": 28229127.0, "reward": 0.24051693081855774, "reward_std": 0.10015859454870224, "rewards/code_reward/mean": 0.14129818975925446, "rewards/code_reward/std": 0.26549696922302246, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 193.115234375, "completions/mean_terminated_length": 193.115234375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.11101163831692032, "frac_reward_zero_std": 0.25, "grad_norm": 0.39976324658751067, "kl": 0.105712890625, "learning_rate": 4.900855439079536e-06, "loss": 0.007, "num_tokens": 28643034.0, "reward": 0.3018861413002014, "reward_std": 0.15776729583740234, "rewards/code_reward/mean": 0.2018861174583435, "rewards/code_reward/std": 0.3306483030319214, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 202.17578125, "completions/mean_terminated_length": 201.67123413085938, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.11280214861235452, "frac_reward_zero_std": 0.28125, "grad_norm": 1.5001723181186795, "kl": 0.438720703125, "learning_rate": 4.8965316531496055e-06, "loss": 0.0296, "num_tokens": 29073156.0, "reward": 0.3028829097747803, "reward_std": 0.13398407399654388, "rewards/code_reward/mean": 0.20288290083408356, "rewards/code_reward/std": 0.3345738649368286, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 728.0, "completions/mean_length": 197.380859375, "completions/mean_terminated_length": 190.12353515625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.11459265890778872, "frac_reward_zero_std": 0.21875, "grad_norm": 0.4086678921840457, "kl": 0.114501953125, "learning_rate": 4.892117803050578e-06, "loss": 0.0646, "num_tokens": 29480775.0, "reward": 0.28962546586990356, "reward_std": 0.14494024217128754, "rewards/code_reward/mean": 0.19001609086990356, "rewards/code_reward/std": 0.3062056303024292, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 588.0, "completions/max_terminated_length": 588.0, "completions/mean_length": 190.546875, "completions/mean_terminated_length": 190.546875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.11638316920322292, "frac_reward_zero_std": 0.28125, "grad_norm": 0.38045778339326064, "kl": 0.116943359375, "learning_rate": 4.887614073978761e-06, "loss": 0.0152, "num_tokens": 29881303.0, "reward": 0.2628862261772156, "reward_std": 0.12928998470306396, "rewards/code_reward/mean": 0.16288623213768005, "rewards/code_reward/std": 0.2907693386077881, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 860.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 204.990234375, "completions/mean_terminated_length": 204.990234375, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.11817367949865712, "frac_reward_zero_std": 0.25, "grad_norm": 0.3780760225962414, "kl": 0.1094970703125, "learning_rate": 4.883020654901609e-06, "loss": 0.0072, "num_tokens": 30300170.0, "reward": 0.2468407154083252, "reward_std": 0.13970966637134552, "rewards/code_reward/mean": 0.14684072136878967, "rewards/code_reward/std": 0.2736409902572632, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 200.439453125, "completions/mean_terminated_length": 200.439453125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.11996418979409132, "frac_reward_zero_std": 0.28125, "grad_norm": 0.38766533173069684, "kl": 0.11572265625, "learning_rate": 4.878337738549785e-06, "loss": 0.0129, "num_tokens": 30706835.0, "reward": 0.2502107322216034, "reward_std": 0.10867178440093994, "rewards/code_reward/mean": 0.15040606260299683, "rewards/code_reward/std": 0.2983626425266266, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 818.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 213.07421875, "completions/mean_terminated_length": 213.07421875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.12175470008952552, "frac_reward_zero_std": 0.390625, "grad_norm": 0.3470548350679671, "kl": 0.1080322265625, "learning_rate": 4.873565521409082e-06, "loss": -0.0098, "num_tokens": 31101417.0, "reward": 0.25901976227760315, "reward_std": 0.10301733016967773, "rewards/code_reward/mean": 0.15901973843574524, "rewards/code_reward/std": 0.2845323383808136, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 900.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 217.470703125, "completions/mean_terminated_length": 217.470703125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.12354521038495972, "frac_reward_zero_std": 0.234375, "grad_norm": 0.37230158811945757, "kl": 0.1055908203125, "learning_rate": 4.868704203712173e-06, "loss": -0.0001, "num_tokens": 31522130.0, "reward": 0.30519065260887146, "reward_std": 0.17540235817432404, "rewards/code_reward/mean": 0.2053859531879425, "rewards/code_reward/std": 0.34222090244293213, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 232.328125, "completions/mean_terminated_length": 232.328125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.12533572068039392, "frac_reward_zero_std": 0.34375, "grad_norm": 0.34852247471285386, "kl": 0.108642578125, "learning_rate": 4.86375398943021e-06, "loss": 0.0063, "num_tokens": 31934250.0, "reward": 0.20363816618919373, "reward_std": 0.09052859246730804, "rewards/code_reward/mean": 0.10383348912000656, "rewards/code_reward/std": 0.21051490306854248, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1058.0, "completions/max_terminated_length": 1058.0, "completions/mean_length": 232.376953125, "completions/mean_terminated_length": 232.376953125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.12712623097582812, "frac_reward_zero_std": 0.265625, "grad_norm": 0.36159634366352833, "kl": 0.0992431640625, "learning_rate": 4.858715086264274e-06, "loss": 0.0034, "num_tokens": 32353859.0, "reward": 0.23877419531345367, "reward_std": 0.10723140835762024, "rewards/code_reward/mean": 0.13877418637275696, "rewards/code_reward/std": 0.27055495977401733, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 242.12890625, "completions/mean_terminated_length": 242.12890625, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.12891674127126232, "frac_reward_zero_std": 0.265625, "grad_norm": 0.3443851546471206, "kl": 0.0965576171875, "learning_rate": 4.853587705636646e-06, "loss": -0.0037, "num_tokens": 32783221.0, "reward": 0.2585853934288025, "reward_std": 0.16252876818180084, "rewards/code_reward/mean": 0.15878070890903473, "rewards/code_reward/std": 0.26865139603614807, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 967.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 272.943359375, "completions/mean_terminated_length": 272.943359375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.13070725156669652, "frac_reward_zero_std": 0.296875, "grad_norm": 0.3670158349350539, "kl": 0.111328125, "learning_rate": 4.84837206268195e-06, "loss": -0.002, "num_tokens": 33220296.0, "reward": 0.2545192837715149, "reward_std": 0.13868264853954315, "rewards/code_reward/mean": 0.15471458435058594, "rewards/code_reward/std": 0.2703130841255188, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 899.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 306.33984375, "completions/mean_terminated_length": 306.33984375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.13249776186213072, "frac_reward_zero_std": 0.234375, "grad_norm": 0.31713547482599513, "kl": 0.083984375, "learning_rate": 4.8430683762381195e-06, "loss": 0.0042, "num_tokens": 33704078.0, "reward": 0.2758137583732605, "reward_std": 0.13063742220401764, "rewards/code_reward/mean": 0.17659500241279602, "rewards/code_reward/std": 0.29038169980049133, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 850.0, "completions/max_terminated_length": 850.0, "completions/mean_length": 314.19921875, "completions/mean_terminated_length": 313.59490966796875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.13428827215756492, "frac_reward_zero_std": 0.359375, "grad_norm": 0.8949014900369209, "kl": 0.1702880859375, "learning_rate": 4.837676868837213e-06, "loss": 0.0097, "num_tokens": 34175244.0, "reward": 0.2464839667081833, "reward_std": 0.13638471066951752, "rewards/code_reward/mean": 0.1468745768070221, "rewards/code_reward/std": 0.28026896715164185, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 350.69921875, "completions/mean_terminated_length": 344.04315185546875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.1360787824529991, "frac_reward_zero_std": 0.21875, "grad_norm": 15.82958547239205, "kl": 4.279541015625, "learning_rate": 4.832197766696085e-06, "loss": 0.0669, "num_tokens": 34668194.0, "reward": 0.24861091375350952, "reward_std": 0.155998095870018, "rewards/code_reward/mean": 0.15036872029304504, "rewards/code_reward/std": 0.2720683217048645, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 1391.0, "completions/max_terminated_length": 1391.0, "completions/mean_length": 356.076171875, "completions/mean_terminated_length": 353.2534484863281, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.1378692927484333, "frac_reward_zero_std": 0.1875, "grad_norm": 2.887195895269878, "kl": 0.393310546875, "learning_rate": 4.826631299706887e-06, "loss": 0.0307, "num_tokens": 35175009.0, "reward": 0.257512629032135, "reward_std": 0.132109135389328, "rewards/code_reward/mean": 0.15907514095306396, "rewards/code_reward/std": 0.25943660736083984, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 885.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 360.091796875, "completions/mean_terminated_length": 360.091796875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.1396598030438675, "frac_reward_zero_std": 0.25, "grad_norm": 0.29549268488352665, "kl": 0.075439453125, "learning_rate": 4.820977701427424e-06, "loss": 0.0156, "num_tokens": 35654872.0, "reward": 0.22931262850761414, "reward_std": 0.14669179916381836, "rewards/code_reward/mean": 0.13048452138900757, "rewards/code_reward/std": 0.24265612661838531, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 385.08203125, "completions/mean_terminated_length": 381.7618103027344, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.1414503133393017, "frac_reward_zero_std": 0.25, "grad_norm": 180.9231246293245, "kl": 33.723388671875, "learning_rate": 4.81523720907136e-06, "loss": 0.3501, "num_tokens": 36145418.0, "reward": 0.25271138548851013, "reward_std": 0.13186995685100555, "rewards/code_reward/mean": 0.15485982596874237, "rewards/code_reward/std": 0.2770044207572937, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1431.0, "completions/mean_length": 407.375, "completions/mean_terminated_length": 396.8205261230469, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.1432408236347359, "frac_reward_zero_std": 0.1875, "grad_norm": 62.02542595423062, "kl": 20.5653076171875, "learning_rate": 4.809410063498254e-06, "loss": 0.2525, "num_tokens": 36662650.0, "reward": 0.25465670228004456, "reward_std": 0.15056337416172028, "rewards/code_reward/mean": 0.15641450881958008, "rewards/code_reward/std": 0.26933997869491577, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 393.060546875, "completions/mean_terminated_length": 389.8860778808594, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.1450313339301701, "frac_reward_zero_std": 0.21875, "grad_norm": 1.865982872116515, "kl": 1.060302734375, "learning_rate": 4.8034965092034656e-06, "loss": 0.0414, "num_tokens": 37165201.0, "reward": 0.23326393961906433, "reward_std": 0.1506243199110031, "rewards/code_reward/mean": 0.1358029991388321, "rewards/code_reward/std": 0.2649879455566406, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 1987.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 416.056640625, "completions/mean_terminated_length": 412.1591491699219, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.1468218442256043, "frac_reward_zero_std": 0.21875, "grad_norm": 18.91115729457296, "kl": 2.3857421875, "learning_rate": 4.797496794307889e-06, "loss": 0.0485, "num_tokens": 37701238.0, "reward": 0.25604677200317383, "reward_std": 0.13119077682495117, "rewards/code_reward/mean": 0.1585858166217804, "rewards/code_reward/std": 0.284221351146698, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1147.0, "completions/max_terminated_length": 1147.0, "completions/mean_length": 397.462890625, "completions/mean_terminated_length": 395.8137512207031, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.1486123545210385, "frac_reward_zero_std": 0.203125, "grad_norm": 1.1045622023879396, "kl": 0.2012939453125, "learning_rate": 4.791411170547545e-06, "loss": 0.0304, "num_tokens": 38206491.0, "reward": 0.25371459126472473, "reward_std": 0.1408795565366745, "rewards/code_reward/mean": 0.15586301684379578, "rewards/code_reward/std": 0.2744084298610687, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 864.0, "completions/max_terminated_length": 864.0, "completions/mean_length": 411.517578125, "completions/mean_terminated_length": 411.517578125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.1504028648164727, "frac_reward_zero_std": 0.1875, "grad_norm": 0.2878900204578428, "kl": 0.0731201171875, "learning_rate": 4.785239893263017e-06, "loss": 0.0273, "num_tokens": 38698444.0, "reward": 0.2593565583229065, "reward_std": 0.145517036318779, "rewards/code_reward/mean": 0.16189561784267426, "rewards/code_reward/std": 0.27327021956443787, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1384.0, "completions/max_terminated_length": 1384.0, "completions/mean_length": 411.72265625, "completions/mean_terminated_length": 411.72265625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.1521933751119069, "frac_reward_zero_std": 0.09375, "grad_norm": 0.3239236981309592, "kl": 0.077392578125, "learning_rate": 4.778983221388742e-06, "loss": 0.0166, "num_tokens": 39204670.0, "reward": 0.2738696336746216, "reward_std": 0.17491155862808228, "rewards/code_reward/mean": 0.17621338367462158, "rewards/code_reward/std": 0.2899892330169678, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1111.0, "completions/mean_length": 415.484375, "completions/mean_terminated_length": 412.28961181640625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.1539838854073411, "frac_reward_zero_std": 0.21875, "grad_norm": 0.29597592930179795, "kl": 0.075439453125, "learning_rate": 4.77264141744214e-06, "loss": 0.0257, "num_tokens": 39722870.0, "reward": 0.28541022539138794, "reward_std": 0.13731083273887634, "rewards/code_reward/mean": 0.18775397539138794, "rewards/code_reward/std": 0.3113518953323364, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1203.0, "completions/max_terminated_length": 1203.0, "completions/mean_length": 434.08984375, "completions/mean_terminated_length": 434.08984375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.1557743957027753, "frac_reward_zero_std": 0.234375, "grad_norm": 0.2777846821358406, "kl": 0.0703125, "learning_rate": 4.766214747512603e-06, "loss": 0.0245, "num_tokens": 40265252.0, "reward": 0.22712832689285278, "reward_std": 0.13768994808197021, "rewards/code_reward/mean": 0.1288861483335495, "rewards/code_reward/std": 0.24623318016529083, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1644.0, "completions/max_terminated_length": 1644.0, "completions/mean_length": 420.73046875, "completions/mean_terminated_length": 420.73046875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.1575649059982095, "frac_reward_zero_std": 0.265625, "grad_norm": 0.29167110950193414, "kl": 0.0726318359375, "learning_rate": 4.759703481250331e-06, "loss": 0.0342, "num_tokens": 40784514.0, "reward": 0.20892399549484253, "reward_std": 0.10321653634309769, "rewards/code_reward/mean": 0.11107243597507477, "rewards/code_reward/std": 0.22629529237747192, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1326.0, "completions/max_terminated_length": 1326.0, "completions/mean_length": 407.298828125, "completions/mean_terminated_length": 407.298828125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.1593554162936437, "frac_reward_zero_std": 0.140625, "grad_norm": 0.32007746239539625, "kl": 0.074951171875, "learning_rate": 4.753107891855015e-06, "loss": 0.0149, "num_tokens": 41297315.0, "reward": 0.2365364134311676, "reward_std": 0.14443057775497437, "rewards/code_reward/mean": 0.1398567259311676, "rewards/code_reward/std": 0.2459539920091629, "rewards/format_reward/mean": 0.966796875, "rewards/format_reward/std": 0.17934183776378632, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1266.0, "completions/max_terminated_length": 1266.0, "completions/mean_length": 406.943359375, "completions/mean_terminated_length": 406.943359375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.1611459265890779, "frac_reward_zero_std": 0.1875, "grad_norm": 0.29130371425852897, "kl": 0.074462890625, "learning_rate": 4.746428256064375e-06, "loss": 0.0284, "num_tokens": 41821670.0, "reward": 0.2874128222465515, "reward_std": 0.17291733622550964, "rewards/code_reward/mean": 0.19014719128608704, "rewards/code_reward/std": 0.31551653146743774, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.16324250400066376, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1254.0, "completions/mean_length": 404.560546875, "completions/mean_terminated_length": 401.34442138671875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.1629364368845121, "frac_reward_zero_std": 0.21875, "grad_norm": 0.3056983675541567, "kl": 0.0697021484375, "learning_rate": 4.7396648541425534e-06, "loss": 0.0251, "num_tokens": 42311853.0, "reward": 0.3292792737483978, "reward_std": 0.16662824153900146, "rewards/code_reward/mean": 0.23025581240653992, "rewards/code_reward/std": 0.3556465208530426, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 396.173828125, "completions/mean_terminated_length": 391.5196228027344, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.1647269471799463, "frac_reward_zero_std": 0.203125, "grad_norm": 54.944093751372755, "kl": 12.9307861328125, "learning_rate": 4.732817969868348e-06, "loss": 0.1534, "num_tokens": 42815478.0, "reward": 0.2822774648666382, "reward_std": 0.15155954658985138, "rewards/code_reward/mean": 0.183644637465477, "rewards/code_reward/std": 0.29826027154922485, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1592.0, "completions/mean_length": 419.544921875, "completions/mean_terminated_length": 416.3581237792969, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.1665174574753805, "frac_reward_zero_std": 0.25, "grad_norm": 0.29281377292208655, "kl": 0.0743408203125, "learning_rate": 4.7258878905233095e-06, "loss": 0.0389, "num_tokens": 43359893.0, "reward": 0.2275458127260208, "reward_std": 0.12970243394374847, "rewards/code_reward/mean": 0.12910830974578857, "rewards/code_reward/std": 0.26821738481521606, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1289.0, "completions/max_terminated_length": 1289.0, "completions/mean_length": 415.6484375, "completions/mean_terminated_length": 415.6484375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.1683079677708147, "frac_reward_zero_std": 0.21875, "grad_norm": 0.29086594708708, "kl": 0.0714111328125, "learning_rate": 4.718874906879688e-06, "loss": 0.0165, "num_tokens": 43880017.0, "reward": 0.21482887864112854, "reward_std": 0.1392717808485031, "rewards/code_reward/mean": 0.11619605123996735, "rewards/code_reward/std": 0.2687281370162964, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1073.0, "completions/max_terminated_length": 1073.0, "completions/mean_length": 411.98046875, "completions/mean_terminated_length": 411.98046875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.1700984780662489, "frac_reward_zero_std": 0.1875, "grad_norm": 0.30674807507597945, "kl": 0.0711669921875, "learning_rate": 4.711779313188231e-06, "loss": 0.0094, "num_tokens": 44390287.0, "reward": 0.2781375050544739, "reward_std": 0.1417272388935089, "rewards/code_reward/mean": 0.1789187341928482, "rewards/code_reward/std": 0.30972954630851746, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1093.0, "completions/max_terminated_length": 1093.0, "completions/mean_length": 407.19140625, "completions/mean_terminated_length": 407.19140625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.1718889883616831, "frac_reward_zero_std": 0.1875, "grad_norm": 0.3095420736126796, "kl": 0.0791015625, "learning_rate": 4.70460140716584e-06, "loss": 0.0034, "num_tokens": 44901489.0, "reward": 0.23000526428222656, "reward_std": 0.13413795828819275, "rewards/code_reward/mean": 0.13215371966362, "rewards/code_reward/std": 0.24548418819904327, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 397.453125, "completions/mean_terminated_length": 397.453125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.17367949865711726, "frac_reward_zero_std": 0.296875, "grad_norm": 0.2706452078925795, "kl": 0.0772705078125, "learning_rate": 4.697341489983076e-06, "loss": 0.012, "num_tokens": 45432769.0, "reward": 0.24562129378318787, "reward_std": 0.14501884579658508, "rewards/code_reward/mean": 0.1467931866645813, "rewards/code_reward/std": 0.2729908525943756, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 950.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 401.671875, "completions/mean_terminated_length": 401.671875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.17547000895255147, "frac_reward_zero_std": 0.296875, "grad_norm": 0.2725810032243102, "kl": 0.072998046875, "learning_rate": 4.6899998662515215e-06, "loss": 0.0059, "num_tokens": 45973481.0, "reward": 0.2680072784423828, "reward_std": 0.14313969016075134, "rewards/code_reward/mean": 0.1689838320016861, "rewards/code_reward/std": 0.30308088660240173, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1140.0, "completions/max_terminated_length": 1140.0, "completions/mean_length": 396.35546875, "completions/mean_terminated_length": 396.35546875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.17726051924798567, "frac_reward_zero_std": 0.265625, "grad_norm": 0.28954153484237305, "kl": 0.07080078125, "learning_rate": 4.682576844011007e-06, "loss": 0.0245, "num_tokens": 46504975.0, "reward": 0.23878340423107147, "reward_std": 0.13447996973991394, "rewards/code_reward/mean": 0.139564648270607, "rewards/code_reward/std": 0.25893452763557434, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 942.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 377.82421875, "completions/mean_terminated_length": 377.82421875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.17905102954341987, "frac_reward_zero_std": 0.171875, "grad_norm": 0.31627233906729824, "kl": 0.0714111328125, "learning_rate": 4.675072734716678e-06, "loss": 0.0115, "num_tokens": 46984837.0, "reward": 0.26939547061920166, "reward_std": 0.15329280495643616, "rewards/code_reward/mean": 0.17017671465873718, "rewards/code_reward/std": 0.30700647830963135, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 943.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 363.138671875, "completions/mean_terminated_length": 363.138671875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.18084153983885407, "frac_reward_zero_std": 0.1875, "grad_norm": 0.32250713655679614, "kl": 0.068603515625, "learning_rate": 4.667487853225931e-06, "loss": 0.0196, "num_tokens": 47477780.0, "reward": 0.2528887987136841, "reward_std": 0.12872125208377838, "rewards/code_reward/mean": 0.15386536717414856, "rewards/code_reward/std": 0.2818002998828888, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1032.0, "completions/max_terminated_length": 1032.0, "completions/mean_length": 352.603515625, "completions/mean_terminated_length": 352.603515625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.18263205013428827, "frac_reward_zero_std": 0.203125, "grad_norm": 0.31328492538996805, "kl": 0.0704345703125, "learning_rate": 4.659822517785203e-06, "loss": 0.0016, "num_tokens": 47961609.0, "reward": 0.26954489946365356, "reward_std": 0.1535976380109787, "rewards/code_reward/mean": 0.17052146792411804, "rewards/code_reward/std": 0.3008057475090027, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 328.923828125, "completions/mean_terminated_length": 328.4187927246094, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.18442256042972247, "frac_reward_zero_std": 0.296875, "grad_norm": 0.822063694299378, "kl": 0.0767822265625, "learning_rate": 4.6520770500166165e-06, "loss": 0.0065, "num_tokens": 48436130.0, "reward": 0.25916385650634766, "reward_std": 0.12211017310619354, "rewards/code_reward/mean": 0.1593591868877411, "rewards/code_reward/std": 0.27345800399780273, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1622.0, "completions/max_terminated_length": 1622.0, "completions/mean_length": 347.5, "completions/mean_terminated_length": 347.5, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.18621307072515667, "frac_reward_zero_std": 0.21875, "grad_norm": 0.312990857975619, "kl": 0.0704345703125, "learning_rate": 4.644251774904487e-06, "loss": 0.0158, "num_tokens": 48924866.0, "reward": 0.259175181388855, "reward_std": 0.1552465707063675, "rewards/code_reward/mean": 0.16112829744815826, "rewards/code_reward/std": 0.28096309304237366, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1346.0, "completions/max_terminated_length": 1346.0, "completions/mean_length": 331.873046875, "completions/mean_terminated_length": 331.873046875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.18800358102059087, "frac_reward_zero_std": 0.40625, "grad_norm": 0.30354157143059635, "kl": 0.09033203125, "learning_rate": 4.636347020781684e-06, "loss": 0.026, "num_tokens": 49408641.0, "reward": 0.22020795941352844, "reward_std": 0.10169152915477753, "rewards/code_reward/mean": 0.12059856951236725, "rewards/code_reward/std": 0.24559232592582703, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 814.0, "completions/max_terminated_length": 814.0, "completions/mean_length": 312.5078125, "completions/mean_terminated_length": 312.5078125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.18979409131602507, "frac_reward_zero_std": 0.265625, "grad_norm": 0.3276691189366287, "kl": 0.076171875, "learning_rate": 4.6283631193158605e-06, "loss": 0.0218, "num_tokens": 49892741.0, "reward": 0.26345115900039673, "reward_std": 0.12912996113300323, "rewards/code_reward/mean": 0.1650136411190033, "rewards/code_reward/std": 0.2715831398963928, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 866.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 303.814453125, "completions/mean_terminated_length": 303.814453125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.19158460161145927, "frac_reward_zero_std": 0.21875, "grad_norm": 0.35116546617659244, "kl": 0.0765380859375, "learning_rate": 4.620300405495532e-06, "loss": 0.0212, "num_tokens": 50336414.0, "reward": 0.3703755736351013, "reward_std": 0.14746375381946564, "rewards/code_reward/mean": 0.2719380557537079, "rewards/code_reward/std": 0.3724975883960724, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 934.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 318.162109375, "completions/mean_terminated_length": 318.162109375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.19337511190689347, "frac_reward_zero_std": 0.25, "grad_norm": 0.32949394912140384, "kl": 0.079833984375, "learning_rate": 4.612159217616022e-06, "loss": -0.0026, "num_tokens": 50818337.0, "reward": 0.25186407566070557, "reward_std": 0.13736267387866974, "rewards/code_reward/mean": 0.15342652797698975, "rewards/code_reward/std": 0.28991928696632385, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1221.0, "completions/mean_length": 300.23828125, "completions/mean_terminated_length": 296.8179931640625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.19516562220232767, "frac_reward_zero_std": 0.15625, "grad_norm": 0.3466372914287762, "kl": 0.079345703125, "learning_rate": 4.603939897265268e-06, "loss": 0.0182, "num_tokens": 51296659.0, "reward": 0.2714746594429016, "reward_std": 0.15743209421634674, "rewards/code_reward/mean": 0.17264652252197266, "rewards/code_reward/std": 0.28297537565231323, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 788.0, "completions/max_terminated_length": 788.0, "completions/mean_length": 296.5390625, "completions/mean_terminated_length": 295.8199462890625, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.19695613249776187, "frac_reward_zero_std": 0.28125, "grad_norm": 0.32005447762215933, "kl": 0.076416015625, "learning_rate": 4.595642789309492e-06, "loss": 0.0095, "num_tokens": 51736471.0, "reward": 0.26083555817604065, "reward_std": 0.13127371668815613, "rewards/code_reward/mean": 0.16161680221557617, "rewards/code_reward/std": 0.28814712166786194, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 685.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 292.033203125, "completions/mean_terminated_length": 292.033203125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.19874664279319607, "frac_reward_zero_std": 0.375, "grad_norm": 0.3000594284911877, "kl": 0.08349609375, "learning_rate": 4.587268241878724e-06, "loss": 0.019, "num_tokens": 52186248.0, "reward": 0.23488736152648926, "reward_std": 0.09540726244449615, "rewards/code_reward/mean": 0.13684049248695374, "rewards/code_reward/std": 0.27229395508766174, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1125.0, "completions/max_terminated_length": 1125.0, "completions/mean_length": 294.51953125, "completions/mean_terminated_length": 294.51953125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.20053715308863027, "frac_reward_zero_std": 0.21875, "grad_norm": 0.3534838781648574, "kl": 0.0919189453125, "learning_rate": 4.578816606352205e-06, "loss": 0.0274, "num_tokens": 52662154.0, "reward": 0.2588709592819214, "reward_std": 0.13111022114753723, "rewards/code_reward/mean": 0.16082406044006348, "rewards/code_reward/std": 0.27863022685050964, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 694.0, "completions/max_terminated_length": 694.0, "completions/mean_length": 292.15625, "completions/mean_terminated_length": 292.15625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.20232766338406447, "frac_reward_zero_std": 0.265625, "grad_norm": 0.33609345259653484, "kl": 0.0782470703125, "learning_rate": 4.570288237343632e-06, "loss": 0.0215, "num_tokens": 53127114.0, "reward": 0.27459651231765747, "reward_std": 0.1417437046766281, "rewards/code_reward/mean": 0.176354318857193, "rewards/code_reward/std": 0.3142363727092743, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 930.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 288.77734375, "completions/mean_terminated_length": 288.77734375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.20411817367949867, "frac_reward_zero_std": 0.25, "grad_norm": 0.32214419659805293, "kl": 0.080322265625, "learning_rate": 4.561683492686289e-06, "loss": 0.0312, "num_tokens": 53556680.0, "reward": 0.3147793412208557, "reward_std": 0.17243613302707672, "rewards/code_reward/mean": 0.21653714776039124, "rewards/code_reward/std": 0.33589228987693787, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 274.591796875, "completions/mean_terminated_length": 274.591796875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.20590868397493287, "frac_reward_zero_std": 0.203125, "grad_norm": 0.36672549435017615, "kl": 0.0867919921875, "learning_rate": 4.5530027334180285e-06, "loss": 0.0295, "num_tokens": 53985007.0, "reward": 0.25981223583221436, "reward_std": 0.14873994886875153, "rewards/code_reward/mean": 0.16235128045082092, "rewards/code_reward/std": 0.27079546451568604, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 772.0, "completions/max_terminated_length": 772.0, "completions/mean_length": 264.1953125, "completions/mean_terminated_length": 264.1953125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.20769919427036707, "frac_reward_zero_std": 0.1875, "grad_norm": 0.36888006391525435, "kl": 0.0843505859375, "learning_rate": 4.544246323766122e-06, "loss": 0.0437, "num_tokens": 54438907.0, "reward": 0.25207504630088806, "reward_std": 0.14301323890686035, "rewards/code_reward/mean": 0.15402816236019135, "rewards/code_reward/std": 0.26615646481513977, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 261.66796875, "completions/mean_terminated_length": 261.66796875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.20948970456580127, "frac_reward_zero_std": 0.28125, "grad_norm": 0.3493012274997051, "kl": 0.0916748046875, "learning_rate": 4.535414631131983e-06, "loss": 0.0204, "num_tokens": 54881713.0, "reward": 0.2882494330406189, "reward_std": 0.13065636157989502, "rewards/code_reward/mean": 0.19000723958015442, "rewards/code_reward/std": 0.32178938388824463, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 257.193359375, "completions/mean_terminated_length": 257.193359375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.21128021486123547, "frac_reward_zero_std": 0.265625, "grad_norm": 0.3469584890086513, "kl": 0.0877685546875, "learning_rate": 4.526508026075746e-06, "loss": 0.0151, "num_tokens": 55315396.0, "reward": 0.270325243473053, "reward_std": 0.12790974974632263, "rewards/code_reward/mean": 0.17188775539398193, "rewards/code_reward/std": 0.2993796765804291, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 644.0, "completions/max_terminated_length": 644.0, "completions/mean_length": 257.06640625, "completions/mean_terminated_length": 257.06640625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.21307072515666964, "frac_reward_zero_std": 0.296875, "grad_norm": 0.3402809639020275, "kl": 0.0928955078125, "learning_rate": 4.517526882300721e-06, "loss": 0.0167, "num_tokens": 55736222.0, "reward": 0.2703761160373688, "reward_std": 0.10796543210744858, "rewards/code_reward/mean": 0.17135266959667206, "rewards/code_reward/std": 0.2858830690383911, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1244.0, "completions/mean_length": 264.041015625, "completions/mean_terminated_length": 260.5498962402344, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.21486123545210384, "frac_reward_zero_std": 0.3125, "grad_norm": 0.31711490285302923, "kl": 0.0897216796875, "learning_rate": 4.508471576637713e-06, "loss": 0.026, "num_tokens": 56177875.0, "reward": 0.2722854018211365, "reward_std": 0.1381382942199707, "rewards/code_reward/mean": 0.17326194047927856, "rewards/code_reward/std": 0.31433185935020447, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 256.197265625, "completions/mean_terminated_length": 256.197265625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.21665174574753804, "frac_reward_zero_std": 0.296875, "grad_norm": 0.3375132988699812, "kl": 0.08740234375, "learning_rate": 4.499342489029211e-06, "loss": 0.0192, "num_tokens": 56618296.0, "reward": 0.312713623046875, "reward_std": 0.14304915070533752, "rewards/code_reward/mean": 0.21349486708641052, "rewards/code_reward/std": 0.32038965821266174, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 253.599609375, "completions/mean_terminated_length": 253.599609375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.21844225604297224, "frac_reward_zero_std": 0.3125, "grad_norm": 0.3368015132064829, "kl": 0.0831298828125, "learning_rate": 4.490140002513449e-06, "loss": 0.0195, "num_tokens": 57069323.0, "reward": 0.27316680550575256, "reward_std": 0.16733002662658691, "rewards/code_reward/mean": 0.1733621060848236, "rewards/code_reward/std": 0.30614954233169556, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 906.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 263.888671875, "completions/mean_terminated_length": 263.888671875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.22023276633840644, "frac_reward_zero_std": 0.28125, "grad_norm": 0.34518587384641464, "kl": 0.0909423828125, "learning_rate": 4.48086450320833e-06, "loss": 0.0061, "num_tokens": 57509370.0, "reward": 0.34867969155311584, "reward_std": 0.13500499725341797, "rewards/code_reward/mean": 0.24907030165195465, "rewards/code_reward/std": 0.3494661748409271, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 728.0, "completions/max_terminated_length": 728.0, "completions/mean_length": 264.9765625, "completions/mean_terminated_length": 264.9765625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.22202327663384064, "frac_reward_zero_std": 0.234375, "grad_norm": 0.3533225292456153, "kl": 0.0894775390625, "learning_rate": 4.4715163802952266e-06, "loss": 0.0082, "num_tokens": 57955406.0, "reward": 0.25632357597351074, "reward_std": 0.15376761555671692, "rewards/code_reward/mean": 0.15671420097351074, "rewards/code_reward/std": 0.28637081384658813, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 706.0, "completions/max_terminated_length": 706.0, "completions/mean_length": 271.1484375, "completions/mean_terminated_length": 271.1484375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.22381378692927484, "frac_reward_zero_std": 0.1875, "grad_norm": 0.36100705749379314, "kl": 0.092041015625, "learning_rate": 4.462096026002655e-06, "loss": 0.0224, "num_tokens": 58404378.0, "reward": 0.28531506657600403, "reward_std": 0.11426351219415665, "rewards/code_reward/mean": 0.18590101599693298, "rewards/code_reward/std": 0.2740923762321472, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 272.396484375, "completions/mean_terminated_length": 272.396484375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.22560429722470904, "frac_reward_zero_std": 0.265625, "grad_norm": 0.34388591211430536, "kl": 0.088623046875, "learning_rate": 4.4526038355898144e-06, "loss": 0.0265, "num_tokens": 58864757.0, "reward": 0.27889060974121094, "reward_std": 0.14235535264015198, "rewards/code_reward/mean": 0.17967185378074646, "rewards/code_reward/std": 0.27678149938583374, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 627.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 259.345703125, "completions/mean_terminated_length": 259.345703125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.22739480752014324, "frac_reward_zero_std": 0.28125, "grad_norm": 0.34629547410425565, "kl": 0.0826416015625, "learning_rate": 4.4430402073300035e-06, "loss": 0.0335, "num_tokens": 59294790.0, "reward": 0.26384031772613525, "reward_std": 0.14073538780212402, "rewards/code_reward/mean": 0.16462156176567078, "rewards/code_reward/std": 0.28662827610969543, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 648.0, "completions/max_terminated_length": 648.0, "completions/mean_length": 258.755859375, "completions/mean_terminated_length": 258.755859375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.22918531781557744, "frac_reward_zero_std": 0.234375, "grad_norm": 0.3593357677596667, "kl": 0.0877685546875, "learning_rate": 4.433405542493909e-06, "loss": -0.0085, "num_tokens": 59731889.0, "reward": 0.27691012620925903, "reward_std": 0.1281612068414688, "rewards/code_reward/mean": 0.17730073630809784, "rewards/code_reward/std": 0.31043851375579834, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 799.0, "completions/max_terminated_length": 799.0, "completions/mean_length": 280.978515625, "completions/mean_terminated_length": 280.978515625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.23097582811101164, "frac_reward_zero_std": 0.265625, "grad_norm": 0.32895471831299544, "kl": 0.08251953125, "learning_rate": 4.4237002453327734e-06, "loss": 0.0073, "num_tokens": 60192142.0, "reward": 0.29392462968826294, "reward_std": 0.1577589213848114, "rewards/code_reward/mean": 0.19411993026733398, "rewards/code_reward/std": 0.3232797384262085, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 725.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 276.83984375, "completions/mean_terminated_length": 276.83984375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.23276633840644584, "frac_reward_zero_std": 0.21875, "grad_norm": 0.3507318132989103, "kl": 0.0845947265625, "learning_rate": 4.4139247230614245e-06, "loss": -0.0047, "num_tokens": 60642420.0, "reward": 0.2884120047092438, "reward_std": 0.1675199717283249, "rewards/code_reward/mean": 0.18899792432785034, "rewards/code_reward/std": 0.31030187010765076, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 747.0, "completions/max_terminated_length": 747.0, "completions/mean_length": 278.59375, "completions/mean_terminated_length": 278.123291015625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.23455684870188004, "frac_reward_zero_std": 0.359375, "grad_norm": 766.7086675408019, "kl": 288.0589599609375, "learning_rate": 4.404079385841201e-06, "loss": 2.8964, "num_tokens": 61096516.0, "reward": 0.25187644362449646, "reward_std": 0.12037205696105957, "rewards/code_reward/mean": 0.1520717293024063, "rewards/code_reward/std": 0.2902584969997406, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 926.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 295.330078125, "completions/mean_terminated_length": 295.330078125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.23634735899731424, "frac_reward_zero_std": 0.3125, "grad_norm": 0.3322685510194651, "kl": 0.0859375, "learning_rate": 4.394164646762734e-06, "loss": 0.0173, "num_tokens": 61597037.0, "reward": 0.2732861042022705, "reward_std": 0.15091478824615479, "rewards/code_reward/mean": 0.17387202382087708, "rewards/code_reward/std": 0.2873980700969696, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 715.0, "completions/max_terminated_length": 715.0, "completions/mean_length": 301.1328125, "completions/mean_terminated_length": 301.1328125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.23813786929274844, "frac_reward_zero_std": 0.28125, "grad_norm": 0.3354559104082405, "kl": 0.089599609375, "learning_rate": 4.384180921828618e-06, "loss": -0.0051, "num_tokens": 62052481.0, "reward": 0.2619659900665283, "reward_std": 0.13542994856834412, "rewards/code_reward/mean": 0.16274723410606384, "rewards/code_reward/std": 0.270715594291687, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 785.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 308.494140625, "completions/mean_terminated_length": 308.494140625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.23992837958818264, "frac_reward_zero_std": 0.25, "grad_norm": 0.31357202251404237, "kl": 0.081298828125, "learning_rate": 4.374128629935955e-06, "loss": 0.0117, "num_tokens": 62505558.0, "reward": 0.2640910744667053, "reward_std": 0.11918876320123672, "rewards/code_reward/mean": 0.1640910506248474, "rewards/code_reward/std": 0.2883664071559906, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1603.0, "completions/max_terminated_length": 1603.0, "completions/mean_length": 322.427734375, "completions/mean_terminated_length": 322.427734375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.24171888988361684, "frac_reward_zero_std": 0.234375, "grad_norm": 0.32384608333562476, "kl": 0.076904296875, "learning_rate": 4.364008192858781e-06, "loss": 0.0002, "num_tokens": 62973441.0, "reward": 0.34225231409072876, "reward_std": 0.15532821416854858, "rewards/code_reward/mean": 0.24225232005119324, "rewards/code_reward/std": 0.3316134810447693, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 762.0, "completions/max_terminated_length": 762.0, "completions/mean_length": 337.404296875, "completions/mean_terminated_length": 337.404296875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.24350940017905104, "frac_reward_zero_std": 0.21875, "grad_norm": 0.3267735161185479, "kl": 0.083984375, "learning_rate": 4.353820035230366e-06, "loss": 0.0142, "num_tokens": 63453760.0, "reward": 0.2892310619354248, "reward_std": 0.16023030877113342, "rewards/code_reward/mean": 0.1902076005935669, "rewards/code_reward/std": 0.3095491826534271, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 871.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 328.95703125, "completions/mean_terminated_length": 328.95703125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.24529991047448524, "frac_reward_zero_std": 0.265625, "grad_norm": 0.34001051090544127, "kl": 0.0936279296875, "learning_rate": 4.3435645845254e-06, "loss": 0.0265, "num_tokens": 63946802.0, "reward": 0.2441038191318512, "reward_std": 0.09911616146564484, "rewards/code_reward/mean": 0.14449442923069, "rewards/code_reward/std": 0.2648237943649292, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 680.0, "completions/max_terminated_length": 680.0, "completions/mean_length": 329.267578125, "completions/mean_terminated_length": 329.267578125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.24709042076991944, "frac_reward_zero_std": 0.265625, "grad_norm": 0.30488042709827, "kl": 0.0789794921875, "learning_rate": 4.333242271042054e-06, "loss": 0.0142, "num_tokens": 64455771.0, "reward": 0.261479914188385, "reward_std": 0.1527477502822876, "rewards/code_reward/mean": 0.16206586360931396, "rewards/code_reward/std": 0.29031750559806824, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1317.0, "completions/max_terminated_length": 1317.0, "completions/mean_length": 338.220703125, "completions/mean_terminated_length": 338.220703125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.24888093106535364, "frac_reward_zero_std": 0.28125, "grad_norm": 0.31212608273231324, "kl": 0.0731201171875, "learning_rate": 4.32285352788393e-06, "loss": 0.0186, "num_tokens": 64927052.0, "reward": 0.27985087037086487, "reward_std": 0.14789444208145142, "rewards/code_reward/mean": 0.17985087633132935, "rewards/code_reward/std": 0.3142760694026947, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 326.392578125, "completions/mean_terminated_length": 326.392578125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.25067144136078784, "frac_reward_zero_std": 0.203125, "grad_norm": 0.32035740876619356, "kl": 0.080810546875, "learning_rate": 4.312398790941882e-06, "loss": 0.0006, "num_tokens": 65377141.0, "reward": 0.27668333053588867, "reward_std": 0.15888220071792603, "rewards/code_reward/mean": 0.17726927995681763, "rewards/code_reward/std": 0.27567827701568604, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1042.0, "completions/max_terminated_length": 1042.0, "completions/mean_length": 332.6484375, "completions/mean_terminated_length": 332.6484375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.25246195165622204, "frac_reward_zero_std": 0.25, "grad_norm": 0.3393455634734922, "kl": 0.0869140625, "learning_rate": 4.301878498875735e-06, "loss": 0.028, "num_tokens": 65887257.0, "reward": 0.3101876974105835, "reward_std": 0.14927417039871216, "rewards/code_reward/mean": 0.21116423606872559, "rewards/code_reward/std": 0.32989010214805603, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 787.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 322.654296875, "completions/mean_terminated_length": 322.654296875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.25425246195165624, "frac_reward_zero_std": 0.359375, "grad_norm": 0.30075200247430256, "kl": 0.08935546875, "learning_rate": 4.291293093095873e-06, "loss": 0.0238, "num_tokens": 66384392.0, "reward": 0.31323903799057007, "reward_std": 0.12605983018875122, "rewards/code_reward/mean": 0.21362966299057007, "rewards/code_reward/std": 0.3290594220161438, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 829.0, "completions/max_terminated_length": 829.0, "completions/mean_length": 313.974609375, "completions/mean_terminated_length": 313.974609375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.25604297224709044, "frac_reward_zero_std": 0.15625, "grad_norm": 0.3413982656794119, "kl": 0.09228515625, "learning_rate": 4.280643017744723e-06, "loss": 0.0133, "num_tokens": 66827675.0, "reward": 0.2857291102409363, "reward_std": 0.14997056126594543, "rewards/code_reward/mean": 0.18611976504325867, "rewards/code_reward/std": 0.3021654784679413, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 313.626953125, "completions/mean_terminated_length": 313.626953125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.25783348254252464, "frac_reward_zero_std": 0.21875, "grad_norm": 0.3418704390722312, "kl": 0.0875244140625, "learning_rate": 4.269928719678117e-06, "loss": 0.0149, "num_tokens": 67302396.0, "reward": 0.2426859438419342, "reward_std": 0.13458314538002014, "rewards/code_reward/mean": 0.14346718788146973, "rewards/code_reward/std": 0.27361878752708435, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 790.0, "completions/max_terminated_length": 790.0, "completions/mean_length": 316.88671875, "completions/mean_terminated_length": 316.88671875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.25962399283795884, "frac_reward_zero_std": 0.234375, "grad_norm": 0.35451145407563023, "kl": 0.08837890625, "learning_rate": 4.2591506484465426e-06, "loss": 0.0032, "num_tokens": 67755050.0, "reward": 0.3200894892215729, "reward_std": 0.14264976978302002, "rewards/code_reward/mean": 0.2208707332611084, "rewards/code_reward/std": 0.3169369697570801, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 963.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 316.267578125, "completions/mean_terminated_length": 316.267578125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.26141450313339304, "frac_reward_zero_std": 0.28125, "grad_norm": 0.3124983836701843, "kl": 0.0836181640625, "learning_rate": 4.248309256276283e-06, "loss": 0.0164, "num_tokens": 68234451.0, "reward": 0.27754729986190796, "reward_std": 0.14490430057048798, "rewards/code_reward/mean": 0.17754730582237244, "rewards/code_reward/std": 0.29222720861434937, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 307.236328125, "completions/mean_terminated_length": 307.236328125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.26320501342882724, "frac_reward_zero_std": 0.25, "grad_norm": 0.32210720213361593, "kl": 0.0948486328125, "learning_rate": 4.23740499805044e-06, "loss": 0.0085, "num_tokens": 68727844.0, "reward": 0.24946418404579163, "reward_std": 0.12793749570846558, "rewards/code_reward/mean": 0.15024542808532715, "rewards/code_reward/std": 0.25524136424064636, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1049.0, "completions/max_terminated_length": 1049.0, "completions/mean_length": 314.38671875, "completions/mean_terminated_length": 314.38671875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.26499552372426144, "frac_reward_zero_std": 0.28125, "grad_norm": 0.3283518627718804, "kl": 0.0850830078125, "learning_rate": 4.22643833128985e-06, "loss": 0.0071, "num_tokens": 69211386.0, "reward": 0.22055652737617493, "reward_std": 0.10302282869815826, "rewards/code_reward/mean": 0.12153308838605881, "rewards/code_reward/std": 0.22413299977779388, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 893.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 319.22265625, "completions/mean_terminated_length": 319.22265625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.26678603401969564, "frac_reward_zero_std": 0.1875, "grad_norm": 0.3341701370399261, "kl": 0.0943603515625, "learning_rate": 4.215409716133885e-06, "loss": 0.0254, "num_tokens": 69691660.0, "reward": 0.30055397748947144, "reward_std": 0.14153146743774414, "rewards/code_reward/mean": 0.20192116498947144, "rewards/code_reward/std": 0.2893611788749695, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 701.0, "completions/max_terminated_length": 701.0, "completions/mean_length": 303.5625, "completions/mean_terminated_length": 303.5625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.26857654431512984, "frac_reward_zero_std": 0.171875, "grad_norm": 0.3474122880977322, "kl": 0.0906982421875, "learning_rate": 4.204319615321151e-06, "loss": 0.0369, "num_tokens": 70159292.0, "reward": 0.3196170926094055, "reward_std": 0.15182232856750488, "rewards/code_reward/mean": 0.22117958962917328, "rewards/code_reward/std": 0.30583736300468445, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 725.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 289.888671875, "completions/mean_terminated_length": 289.888671875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.270367054610564, "frac_reward_zero_std": 0.28125, "grad_norm": 0.3350756301541624, "kl": 0.0936279296875, "learning_rate": 4.193168494170065e-06, "loss": -0.0042, "num_tokens": 70632219.0, "reward": 0.347265362739563, "reward_std": 0.1525212526321411, "rewards/code_reward/mean": 0.2476559430360794, "rewards/code_reward/std": 0.33187398314476013, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 287.611328125, "completions/mean_terminated_length": 287.611328125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.2721575649059982, "frac_reward_zero_std": 0.25, "grad_norm": 0.3541767942276267, "kl": 0.0877685546875, "learning_rate": 4.181956820559339e-06, "loss": 0.0044, "num_tokens": 71090756.0, "reward": 0.3067777156829834, "reward_std": 0.15955448150634766, "rewards/code_reward/mean": 0.2071683555841446, "rewards/code_reward/std": 0.3137202858924866, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 723.0, "completions/max_terminated_length": 723.0, "completions/mean_length": 311.171875, "completions/mean_terminated_length": 311.171875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.2739480752014324, "frac_reward_zero_std": 0.25, "grad_norm": 0.3376160672517025, "kl": 0.091796875, "learning_rate": 4.170685064908342e-06, "loss": 0.0127, "num_tokens": 71573028.0, "reward": 0.23661844432353973, "reward_std": 0.12491189688444138, "rewards/code_reward/mean": 0.13739967346191406, "rewards/code_reward/std": 0.2552027106285095, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 752.0, "completions/max_terminated_length": 752.0, "completions/mean_length": 295.4609375, "completions/mean_terminated_length": 295.4609375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.2757385854968666, "frac_reward_zero_std": 0.296875, "grad_norm": 0.33277209587291007, "kl": 0.0889892578125, "learning_rate": 4.159353700157365e-06, "loss": -0.002, "num_tokens": 72036920.0, "reward": 0.2636808454990387, "reward_std": 0.11535799503326416, "rewards/code_reward/mean": 0.16446208953857422, "rewards/code_reward/std": 0.2822232246398926, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 764.0, "completions/max_terminated_length": 764.0, "completions/mean_length": 304.326171875, "completions/mean_terminated_length": 304.326171875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.2775290957923008, "frac_reward_zero_std": 0.3125, "grad_norm": 0.3261167110842557, "kl": 0.0860595703125, "learning_rate": 4.14796320174778e-06, "loss": 0.0189, "num_tokens": 72506887.0, "reward": 0.29393240809440613, "reward_std": 0.14501366019248962, "rewards/code_reward/mean": 0.19432303309440613, "rewards/code_reward/std": 0.3189610242843628, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 800.0, "completions/max_terminated_length": 800.0, "completions/mean_length": 303.283203125, "completions/mean_terminated_length": 303.283203125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.279319606087735, "frac_reward_zero_std": 0.234375, "grad_norm": 0.32523434877481816, "kl": 0.091064453125, "learning_rate": 4.136514047602087e-06, "loss": 0.0163, "num_tokens": 72943176.0, "reward": 0.3288341760635376, "reward_std": 0.1549176275730133, "rewards/code_reward/mean": 0.22961537539958954, "rewards/code_reward/std": 0.3439922630786896, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1124.0, "completions/max_terminated_length": 1124.0, "completions/mean_length": 308.77734375, "completions/mean_terminated_length": 308.77734375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.2811101163831692, "frac_reward_zero_std": 0.28125, "grad_norm": 0.3376742947588228, "kl": 0.089599609375, "learning_rate": 4.1250067181038635e-06, "loss": -0.011, "num_tokens": 73429294.0, "reward": 0.2583348751068115, "reward_std": 0.13448381423950195, "rewards/code_reward/mean": 0.159311443567276, "rewards/code_reward/std": 0.2972154915332794, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1040.0, "completions/mean_length": 304.134765625, "completions/mean_terminated_length": 300.72210693359375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.2829006266786034, "frac_reward_zero_std": 0.1875, "grad_norm": 0.3467537385973916, "kl": 0.08935546875, "learning_rate": 4.113441696077608e-06, "loss": 0.0459, "num_tokens": 73896835.0, "reward": 0.2911016345024109, "reward_std": 0.1565992534160614, "rewards/code_reward/mean": 0.19188286364078522, "rewards/code_reward/std": 0.3194141089916229, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1469.0, "completions/max_terminated_length": 1469.0, "completions/mean_length": 307.626953125, "completions/mean_terminated_length": 307.626953125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.2846911369740376, "frac_reward_zero_std": 0.296875, "grad_norm": 0.3042480928117257, "kl": 0.0892333984375, "learning_rate": 4.101819466768484e-06, "loss": 0.0213, "num_tokens": 74392780.0, "reward": 0.3241274058818817, "reward_std": 0.1580726057291031, "rewards/code_reward/mean": 0.2245180308818817, "rewards/code_reward/std": 0.32076361775398254, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 992.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 300.556640625, "completions/mean_terminated_length": 300.556640625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.2864816472694718, "frac_reward_zero_std": 0.3125, "grad_norm": 0.31802806771671355, "kl": 0.092529296875, "learning_rate": 4.0901405178219535e-06, "loss": 0.0028, "num_tokens": 74829457.0, "reward": 0.2814684808254242, "reward_std": 0.12700793147087097, "rewards/code_reward/mean": 0.18205440044403076, "rewards/code_reward/std": 0.2906831204891205, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 318.6484375, "completions/mean_terminated_length": 318.6484375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.288272157564906, "frac_reward_zero_std": 0.234375, "grad_norm": 0.33462330289330167, "kl": 0.0911865234375, "learning_rate": 4.078405339263326e-06, "loss": 0.0172, "num_tokens": 75303749.0, "reward": 0.33636578917503357, "reward_std": 0.15007105469703674, "rewards/code_reward/mean": 0.23773297667503357, "rewards/code_reward/std": 0.3330623209476471, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 871.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 320.080078125, "completions/mean_terminated_length": 320.080078125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.2900626678603402, "frac_reward_zero_std": 0.21875, "grad_norm": 0.3277907576973315, "kl": 0.087890625, "learning_rate": 4.06661442347719e-06, "loss": 0.0113, "num_tokens": 75784550.0, "reward": 0.30116409063339233, "reward_std": 0.15441249310970306, "rewards/code_reward/mean": 0.20135939121246338, "rewards/code_reward/std": 0.3244519829750061, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 341.98828125, "completions/mean_terminated_length": 338.6496887207031, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.2918531781557744, "frac_reward_zero_std": 0.15625, "grad_norm": 0.33722036922836335, "kl": 0.0931396484375, "learning_rate": 4.054768265186758e-06, "loss": 0.0319, "num_tokens": 76277384.0, "reward": 0.29440170526504517, "reward_std": 0.13635772466659546, "rewards/code_reward/mean": 0.1955735981464386, "rewards/code_reward/std": 0.29770615696907043, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 327.5390625, "completions/mean_terminated_length": 324.1722106933594, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.2936436884512086, "frac_reward_zero_std": 0.21875, "grad_norm": 0.326133699148681, "kl": 0.0955810546875, "learning_rate": 4.0428673614331036e-06, "loss": 0.0055, "num_tokens": 76723876.0, "reward": 0.2997177839279175, "reward_std": 0.1388765275478363, "rewards/code_reward/mean": 0.20030370354652405, "rewards/code_reward/std": 0.2888404130935669, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1209.0, "completions/max_terminated_length": 1209.0, "completions/mean_length": 323.4921875, "completions/mean_terminated_length": 322.4853210449219, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.2954341987466428, "frac_reward_zero_std": 0.203125, "grad_norm": 0.3594103550242623, "kl": 0.11572265625, "learning_rate": 4.030912211554316e-06, "loss": 0.0143, "num_tokens": 77193344.0, "reward": 0.33038508892059326, "reward_std": 0.13573871552944183, "rewards/code_reward/mean": 0.2311663180589676, "rewards/code_reward/std": 0.3381703495979309, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1036.0, "completions/max_terminated_length": 1036.0, "completions/mean_length": 335.734375, "completions/mean_terminated_length": 335.734375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.297224709042077, "frac_reward_zero_std": 0.265625, "grad_norm": 0.30551544342583403, "kl": 0.088134765625, "learning_rate": 4.018903317164539e-06, "loss": 0.0273, "num_tokens": 77688864.0, "reward": 0.2815166711807251, "reward_std": 0.1665625274181366, "rewards/code_reward/mean": 0.18171197175979614, "rewards/code_reward/std": 0.29258760809898376, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 865.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 359.3515625, "completions/mean_terminated_length": 359.3515625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.2990152193375112, "frac_reward_zero_std": 0.265625, "grad_norm": 0.29987866497370375, "kl": 0.0887451171875, "learning_rate": 4.006841182132932e-06, "loss": -0.0023, "num_tokens": 78225660.0, "reward": 0.2571454644203186, "reward_std": 0.13338415324687958, "rewards/code_reward/mean": 0.15773138403892517, "rewards/code_reward/std": 0.26423293352127075, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 340.384765625, "completions/mean_terminated_length": 337.0430603027344, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.3008057296329454, "frac_reward_zero_std": 0.21875, "grad_norm": 0.3143252160329613, "kl": 0.08935546875, "learning_rate": 3.9947263125625195e-06, "loss": -0.0008, "num_tokens": 78688849.0, "reward": 0.37164345383644104, "reward_std": 0.13972212374210358, "rewards/code_reward/mean": 0.2728153467178345, "rewards/code_reward/std": 0.35561585426330566, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 329.333984375, "completions/mean_terminated_length": 329.333984375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.3025962399283796, "frac_reward_zero_std": 0.15625, "grad_norm": 0.3363661852548089, "kl": 0.100341796875, "learning_rate": 3.982559216768967e-06, "loss": 0.017, "num_tokens": 79152204.0, "reward": 0.28042012453079224, "reward_std": 0.14878323674201965, "rewards/code_reward/mean": 0.1823732554912567, "rewards/code_reward/std": 0.27264365553855896, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 356.900390625, "completions/mean_terminated_length": 353.59100341796875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.3043867502238138, "frac_reward_zero_std": 0.21875, "grad_norm": 0.3097615787989714, "kl": 0.095703125, "learning_rate": 3.970340405259245e-06, "loss": 0.038, "num_tokens": 79655537.0, "reward": 0.2574957311153412, "reward_std": 0.10539855808019638, "rewards/code_reward/mean": 0.1588629186153412, "rewards/code_reward/std": 0.26342031359672546, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1458.0, "completions/max_terminated_length": 1458.0, "completions/mean_length": 345.970703125, "completions/mean_terminated_length": 345.970703125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.306177260519248, "frac_reward_zero_std": 0.296875, "grad_norm": 0.29667292628025727, "kl": 0.0911865234375, "learning_rate": 3.958070390710214e-06, "loss": 0.011, "num_tokens": 80137370.0, "reward": 0.2823272943496704, "reward_std": 0.12240764498710632, "rewards/code_reward/mean": 0.1827179342508316, "rewards/code_reward/std": 0.28809747099876404, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 877.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 350.896484375, "completions/mean_terminated_length": 350.896484375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.3079677708146822, "frac_reward_zero_std": 0.265625, "grad_norm": 0.3018906773051422, "kl": 0.0968017578125, "learning_rate": 3.945749687947109e-06, "loss": -0.0114, "num_tokens": 80621701.0, "reward": 0.32794445753097534, "reward_std": 0.1336970329284668, "rewards/code_reward/mean": 0.2291163206100464, "rewards/code_reward/std": 0.32761216163635254, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 367.1875, "completions/mean_terminated_length": 367.1875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.3097582811101164, "frac_reward_zero_std": 0.15625, "grad_norm": 0.3219022992537172, "kl": 0.091796875, "learning_rate": 3.933378813921942e-06, "loss": 0.0048, "num_tokens": 81121101.0, "reward": 0.3231399953365326, "reward_std": 0.148738294839859, "rewards/code_reward/mean": 0.22411656379699707, "rewards/code_reward/std": 0.33152177929878235, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1472.0, "completions/max_terminated_length": 1472.0, "completions/mean_length": 385.92578125, "completions/mean_terminated_length": 385.92578125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.3115487914055506, "frac_reward_zero_std": 0.234375, "grad_norm": 0.3026065739611122, "kl": 0.09423828125, "learning_rate": 3.920958287691811e-06, "loss": 0.0058, "num_tokens": 81631863.0, "reward": 0.31303754448890686, "reward_std": 0.1397426724433899, "rewards/code_reward/mean": 0.21401408314704895, "rewards/code_reward/std": 0.2977985441684723, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1485.0, "completions/max_terminated_length": 1485.0, "completions/mean_length": 382.146484375, "completions/mean_terminated_length": 382.146484375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.3133393017009848, "frac_reward_zero_std": 0.265625, "grad_norm": 0.32224339184182627, "kl": 0.091552734375, "learning_rate": 3.908488630397121e-06, "loss": 0.0104, "num_tokens": 82118498.0, "reward": 0.29820454120635986, "reward_std": 0.13403087854385376, "rewards/code_reward/mean": 0.19937637448310852, "rewards/code_reward/std": 0.3047467768192291, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1252.0, "completions/mean_length": 397.390625, "completions/mean_terminated_length": 394.16046142578125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.315129811996419, "frac_reward_zero_std": 0.109375, "grad_norm": 0.31131413977903466, "kl": 0.088623046875, "learning_rate": 3.8959703652397175e-06, "loss": 0.0261, "num_tokens": 82631706.0, "reward": 0.3371277451515198, "reward_std": 0.2031174898147583, "rewards/code_reward/mean": 0.23810428380966187, "rewards/code_reward/std": 0.33652055263519287, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1057.0, "completions/max_terminated_length": 1057.0, "completions/mean_length": 384.7109375, "completions/mean_terminated_length": 384.7109375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.3169203222918532, "frac_reward_zero_std": 0.125, "grad_norm": 0.31659613515542256, "kl": 0.0919189453125, "learning_rate": 3.883404017460935e-06, "loss": 0.0277, "num_tokens": 83118558.0, "reward": 0.35138776898384094, "reward_std": 0.16873328387737274, "rewards/code_reward/mean": 0.25216901302337646, "rewards/code_reward/std": 0.3186894655227661, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1561.0, "completions/max_terminated_length": 1561.0, "completions/mean_length": 400.732421875, "completions/mean_terminated_length": 400.732421875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.3187108325872874, "frac_reward_zero_std": 0.203125, "grad_norm": 0.2931388158858786, "kl": 0.085693359375, "learning_rate": 3.870790114319559e-06, "loss": 0.0223, "num_tokens": 83636709.0, "reward": 0.2939419150352478, "reward_std": 0.1368519514799118, "rewards/code_reward/mean": 0.1943325400352478, "rewards/code_reward/std": 0.28705325722694397, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 408.951171875, "completions/mean_terminated_length": 408.951171875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.3205013428827216, "frac_reward_zero_std": 0.21875, "grad_norm": 0.3133799502551747, "kl": 0.087890625, "learning_rate": 3.858129185069701e-06, "loss": 0.0265, "num_tokens": 84147252.0, "reward": 0.2713393568992615, "reward_std": 0.12705281376838684, "rewards/code_reward/mean": 0.17270652949810028, "rewards/code_reward/std": 0.29455018043518066, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1242.0, "completions/max_terminated_length": 1242.0, "completions/mean_length": 389.736328125, "completions/mean_terminated_length": 389.736328125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.3222918531781558, "frac_reward_zero_std": 0.265625, "grad_norm": 0.2860620531777677, "kl": 0.0885009765625, "learning_rate": 3.845421760938597e-06, "loss": 0.0098, "num_tokens": 84649269.0, "reward": 0.33583176136016846, "reward_std": 0.1579369306564331, "rewards/code_reward/mean": 0.23680832982063293, "rewards/code_reward/std": 0.3424367904663086, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1093.0, "completions/max_terminated_length": 1093.0, "completions/mean_length": 389.927734375, "completions/mean_terminated_length": 389.927734375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.32408236347359, "frac_reward_zero_std": 0.140625, "grad_norm": 0.3211277208855997, "kl": 0.0904541015625, "learning_rate": 3.832668375104312e-06, "loss": 0.0392, "num_tokens": 85140488.0, "reward": 0.32419466972351074, "reward_std": 0.14979660511016846, "rewards/code_reward/mean": 0.22517120838165283, "rewards/code_reward/std": 0.32919344305992126, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1104.0, "completions/max_terminated_length": 1104.0, "completions/mean_length": 407.181640625, "completions/mean_terminated_length": 407.181640625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.3258728737690242, "frac_reward_zero_std": 0.28125, "grad_norm": 0.2731325460174965, "kl": 0.08984375, "learning_rate": 3.8198695626733725e-06, "loss": 0.0169, "num_tokens": 85652373.0, "reward": 0.26337480545043945, "reward_std": 0.11866191774606705, "rewards/code_reward/mean": 0.16415606439113617, "rewards/code_reward/std": 0.2813056707382202, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1135.0, "completions/max_terminated_length": 1135.0, "completions/mean_length": 382.01171875, "completions/mean_terminated_length": 382.01171875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.3276633840644584, "frac_reward_zero_std": 0.1875, "grad_norm": 0.29966499662459944, "kl": 0.091064453125, "learning_rate": 3.8070258606583156e-06, "loss": 0.0157, "num_tokens": 86162011.0, "reward": 0.31865930557250977, "reward_std": 0.1545526534318924, "rewards/code_reward/mean": 0.21885459125041962, "rewards/code_reward/std": 0.33023956418037415, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1858.0, "completions/max_terminated_length": 1858.0, "completions/mean_length": 381.765625, "completions/mean_terminated_length": 381.765625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.3294538943598926, "frac_reward_zero_std": 0.1875, "grad_norm": 0.30773797092609967, "kl": 0.0887451171875, "learning_rate": 3.7941378079551544e-06, "loss": 0.0138, "num_tokens": 86664859.0, "reward": 0.3531650900840759, "reward_std": 0.1657389998435974, "rewards/code_reward/mean": 0.2535557150840759, "rewards/code_reward/std": 0.35444554686546326, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1129.0, "completions/max_terminated_length": 1129.0, "completions/mean_length": 386.017578125, "completions/mean_terminated_length": 386.017578125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.3312444046553268, "frac_reward_zero_std": 0.265625, "grad_norm": 0.2977258797346693, "kl": 0.0908203125, "learning_rate": 3.7812059453207677e-06, "loss": 0.0218, "num_tokens": 87171228.0, "reward": 0.24993132054805756, "reward_std": 0.11948941648006439, "rewards/code_reward/mean": 0.15032193064689636, "rewards/code_reward/std": 0.27073201537132263, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 372.626953125, "completions/mean_terminated_length": 369.34832763671875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.333034914950761, "frac_reward_zero_std": 0.265625, "grad_norm": 0.3111233007491883, "kl": 0.09619140625, "learning_rate": 3.768230815350213e-06, "loss": 0.0327, "num_tokens": 87670701.0, "reward": 0.2650620639324188, "reward_std": 0.13734793663024902, "rewards/code_reward/mean": 0.16584332287311554, "rewards/code_reward/std": 0.2918318510055542, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1216.0, "completions/max_terminated_length": 1216.0, "completions/mean_length": 352.484375, "completions/mean_terminated_length": 352.484375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.3348254252461952, "frac_reward_zero_std": 0.3125, "grad_norm": 0.29261815912362293, "kl": 0.091552734375, "learning_rate": 3.7552129624539557e-06, "loss": 0.0254, "num_tokens": 88134229.0, "reward": 0.32921725511550903, "reward_std": 0.1672164499759674, "rewards/code_reward/mean": 0.2301938384771347, "rewards/code_reward/std": 0.3331674635410309, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1203.0, "completions/max_terminated_length": 1203.0, "completions/mean_length": 369.607421875, "completions/mean_terminated_length": 369.607421875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.3366159355416294, "frac_reward_zero_std": 0.203125, "grad_norm": 0.3277863762694554, "kl": 0.096435546875, "learning_rate": 3.7421529328350316e-06, "loss": 0.0061, "num_tokens": 88654548.0, "reward": 0.34476959705352783, "reward_std": 0.16693969070911407, "rewards/code_reward/mean": 0.24555082619190216, "rewards/code_reward/std": 0.33154475688934326, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1244.0, "completions/max_terminated_length": 1244.0, "completions/mean_length": 340.1953125, "completions/mean_terminated_length": 340.1953125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.3384064458370636, "frac_reward_zero_std": 0.171875, "grad_norm": 0.36419765674915167, "kl": 0.0986328125, "learning_rate": 3.7290512744661274e-06, "loss": 0.0098, "num_tokens": 89146096.0, "reward": 0.2718578577041626, "reward_std": 0.13139553368091583, "rewards/code_reward/mean": 0.17205318808555603, "rewards/code_reward/std": 0.28747817873954773, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 977.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 339.7109375, "completions/mean_terminated_length": 339.7109375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.3401969561324978, "frac_reward_zero_std": 0.15625, "grad_norm": 0.33896548237007795, "kl": 0.09814453125, "learning_rate": 3.715908537066589e-06, "loss": 0.0383, "num_tokens": 89638556.0, "reward": 0.2974991202354431, "reward_std": 0.14954832196235657, "rewards/code_reward/mean": 0.19828037917613983, "rewards/code_reward/std": 0.3269672393798828, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1769.0, "completions/max_terminated_length": 1769.0, "completions/mean_length": 347.40234375, "completions/mean_terminated_length": 347.40234375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.341987466427932, "frac_reward_zero_std": 0.15625, "grad_norm": 0.3358751321393763, "kl": 0.108642578125, "learning_rate": 3.7027252720793538e-06, "loss": 0.0122, "num_tokens": 90125874.0, "reward": 0.2984338402748108, "reward_std": 0.15047124028205872, "rewards/code_reward/mean": 0.19901975989341736, "rewards/code_reward/std": 0.3216709792613983, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 958.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 344.947265625, "completions/mean_terminated_length": 344.947265625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.3437779767233662, "frac_reward_zero_std": 0.1875, "grad_norm": 0.3306642025703053, "kl": 0.10009765625, "learning_rate": 3.689502032647817e-06, "loss": 0.0003, "num_tokens": 90612567.0, "reward": 0.2741374969482422, "reward_std": 0.15234723687171936, "rewards/code_reward/mean": 0.175504669547081, "rewards/code_reward/std": 0.2910241484642029, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 329.017578125, "completions/mean_terminated_length": 329.017578125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.3455684870188004, "frac_reward_zero_std": 0.15625, "grad_norm": 0.34551403160943134, "kl": 0.105224609375, "learning_rate": 3.6762393735926245e-06, "loss": 0.0053, "num_tokens": 91081616.0, "reward": 0.33340561389923096, "reward_std": 0.1956927478313446, "rewards/code_reward/mean": 0.23418684303760529, "rewards/code_reward/std": 0.33495593070983887, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1141.0, "completions/max_terminated_length": 1141.0, "completions/mean_length": 325.22265625, "completions/mean_terminated_length": 325.22265625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.34735899731423453, "frac_reward_zero_std": 0.234375, "grad_norm": 0.3399127510734623, "kl": 0.109375, "learning_rate": 3.6629378513883852e-06, "loss": 0.0193, "num_tokens": 91546706.0, "reward": 0.29080790281295776, "reward_std": 0.12690304219722748, "rewards/code_reward/mean": 0.1910032033920288, "rewards/code_reward/std": 0.2975302040576935, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 947.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 317.44140625, "completions/mean_terminated_length": 317.44140625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.34914950760966873, "frac_reward_zero_std": 0.171875, "grad_norm": 0.3414598850786986, "kl": 0.102783203125, "learning_rate": 3.6495980241403307e-06, "loss": 0.0282, "num_tokens": 92007604.0, "reward": 0.34372201561927795, "reward_std": 0.17022624611854553, "rewards/code_reward/mean": 0.24411264061927795, "rewards/code_reward/std": 0.33568263053894043, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 306.814453125, "completions/mean_terminated_length": 303.40704345703125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.35094001790510293, "frac_reward_zero_std": 0.1875, "grad_norm": 0.5393073136174386, "kl": 0.210693359375, "learning_rate": 3.636220451560896e-06, "loss": 0.0391, "num_tokens": 92459933.0, "reward": 0.323898047208786, "reward_std": 0.15022052824497223, "rewards/code_reward/mean": 0.22448398172855377, "rewards/code_reward/std": 0.34322190284729004, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1300.0, "completions/max_terminated_length": 1300.0, "completions/mean_length": 330.75390625, "completions/mean_terminated_length": 330.75390625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.35273052820053713, "frac_reward_zero_std": 0.265625, "grad_norm": 0.3203209800735408, "kl": 0.1019287109375, "learning_rate": 3.622805694946235e-06, "loss": 0.0106, "num_tokens": 92932167.0, "reward": 0.2613775432109833, "reward_std": 0.11885549128055573, "rewards/code_reward/mean": 0.16176815330982208, "rewards/code_reward/std": 0.28443604707717896, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 753.0, "completions/max_terminated_length": 753.0, "completions/mean_length": 301.00390625, "completions/mean_terminated_length": 301.00390625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.35452103849597133, "frac_reward_zero_std": 0.171875, "grad_norm": 0.37250569268531186, "kl": 0.119140625, "learning_rate": 3.609354317152667e-06, "loss": 0.0388, "num_tokens": 93363473.0, "reward": 0.3493797481060028, "reward_std": 0.1509985327720642, "rewards/code_reward/mean": 0.24977034330368042, "rewards/code_reward/std": 0.29251211881637573, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1530.0, "completions/max_terminated_length": 1530.0, "completions/mean_length": 304.875, "completions/mean_terminated_length": 304.875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.35631154879140553, "frac_reward_zero_std": 0.15625, "grad_norm": 0.3611792626496622, "kl": 0.107177734375, "learning_rate": 3.595866882573063e-06, "loss": 0.0094, "num_tokens": 93822169.0, "reward": 0.34343421459198, "reward_std": 0.14315587282180786, "rewards/code_reward/mean": 0.2442154735326767, "rewards/code_reward/std": 0.32000935077667236, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1462.0, "completions/mean_length": 310.75390625, "completions/mean_terminated_length": 307.3542175292969, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.35810205908683973, "frac_reward_zero_std": 0.3125, "grad_norm": 0.3225447174442003, "kl": 0.107421875, "learning_rate": 3.5823439571131675e-06, "loss": 0.009, "num_tokens": 94282707.0, "reward": 0.3178245723247528, "reward_std": 0.1091185063123703, "rewards/code_reward/mean": 0.21841050684452057, "rewards/code_reward/std": 0.30819234251976013, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 940.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 310.59765625, "completions/mean_terminated_length": 310.59765625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.35989256938227393, "frac_reward_zero_std": 0.171875, "grad_norm": 0.357839586003929, "kl": 0.119384765625, "learning_rate": 3.5687861081678477e-06, "loss": 0.0544, "num_tokens": 94779413.0, "reward": 0.34197884798049927, "reward_std": 0.18890967965126038, "rewards/code_reward/mean": 0.24256478250026703, "rewards/code_reward/std": 0.34847474098205566, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 308.1875, "completions/mean_terminated_length": 304.78277587890625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.36168307967770813, "frac_reward_zero_std": 0.140625, "grad_norm": 0.35743321420785423, "kl": 0.1087646484375, "learning_rate": 3.555193904597291e-06, "loss": 0.0222, "num_tokens": 95227149.0, "reward": 0.34482747316360474, "reward_std": 0.20327197015285492, "rewards/code_reward/mean": 0.24560871720314026, "rewards/code_reward/std": 0.35391858220100403, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1587.0, "completions/max_terminated_length": 1587.0, "completions/mean_length": 298.984375, "completions/mean_terminated_length": 298.984375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.36347358997314233, "frac_reward_zero_std": 0.125, "grad_norm": 0.36388154727367283, "kl": 0.113037109375, "learning_rate": 3.541567916703138e-06, "loss": 0.0185, "num_tokens": 95672829.0, "reward": 0.3561699390411377, "reward_std": 0.16193102300167084, "rewards/code_reward/mean": 0.25714653730392456, "rewards/code_reward/std": 0.3306407034397125, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1781.0, "completions/max_terminated_length": 1781.0, "completions/mean_length": 293.5625, "completions/mean_terminated_length": 293.5625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.36526410026857653, "frac_reward_zero_std": 0.109375, "grad_norm": 0.3561597798417023, "kl": 0.1131591796875, "learning_rate": 3.5279087162045517e-06, "loss": 0.0074, "num_tokens": 96113133.0, "reward": 0.3206658959388733, "reward_std": 0.17331650853157043, "rewards/code_reward/mean": 0.22164246439933777, "rewards/code_reward/std": 0.3034915030002594, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 290.796875, "completions/mean_terminated_length": 290.796875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.36705461056401073, "frac_reward_zero_std": 0.125, "grad_norm": 0.3798886333600658, "kl": 0.11669921875, "learning_rate": 3.5142168762142265e-06, "loss": 0.0252, "num_tokens": 96555661.0, "reward": 0.32119131088256836, "reward_std": 0.1440931260585785, "rewards/code_reward/mean": 0.22216784954071045, "rewards/code_reward/std": 0.2958155572414398, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1029.0, "completions/mean_length": 315.94921875, "completions/mean_terminated_length": 305.7406921386719, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.36884512085944493, "frac_reward_zero_std": 0.15625, "grad_norm": 0.352786547439289, "kl": 0.1092529296875, "learning_rate": 3.500492971214347e-06, "loss": 0.0781, "num_tokens": 97023723.0, "reward": 0.3164845108985901, "reward_std": 0.15275104343891144, "rewards/code_reward/mean": 0.2178516983985901, "rewards/code_reward/std": 0.3338899612426758, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 870.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 284.41015625, "completions/mean_terminated_length": 284.41015625, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.37063563115487913, "frac_reward_zero_std": 0.125, "grad_norm": 0.3948469725907631, "kl": 0.12890625, "learning_rate": 3.48673757703248e-06, "loss": 0.0094, "num_tokens": 97455229.0, "reward": 0.32911860942840576, "reward_std": 0.18037407100200653, "rewards/code_reward/mean": 0.23068110644817352, "rewards/code_reward/std": 0.335344523191452, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1088.0, "completions/mean_length": 293.59375, "completions/mean_terminated_length": 290.16046142578125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.37242614145031333, "frac_reward_zero_std": 0.09375, "grad_norm": 0.3593149708980816, "kl": 0.11669921875, "learning_rate": 3.472951270817418e-06, "loss": 0.0307, "num_tokens": 97931005.0, "reward": 0.3536415994167328, "reward_std": 0.1931881457567215, "rewards/code_reward/mean": 0.25481346249580383, "rewards/code_reward/std": 0.32725730538368225, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1072.0, "completions/mean_length": 292.228515625, "completions/mean_terminated_length": 285.3431396484375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.37421665174574753, "frac_reward_zero_std": 0.171875, "grad_norm": 0.3441734366205396, "kl": 0.1181640625, "learning_rate": 3.4591346310149578e-06, "loss": 0.0318, "num_tokens": 98400090.0, "reward": 0.34109294414520264, "reward_std": 0.17372426390647888, "rewards/code_reward/mean": 0.24285078048706055, "rewards/code_reward/std": 0.327332466840744, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1979.0, "completions/max_terminated_length": 1979.0, "completions/mean_length": 275.208984375, "completions/mean_terminated_length": 273.6653747558594, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.37600716204118173, "frac_reward_zero_std": 0.09375, "grad_norm": 3.0314699495057935, "kl": 0.134765625, "learning_rate": 3.445288237343632e-06, "loss": 0.0466, "num_tokens": 98838293.0, "reward": 0.3930898904800415, "reward_std": 0.20035672187805176, "rewards/code_reward/mean": 0.29387110471725464, "rewards/code_reward/std": 0.35764992237091064, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1697.0, "completions/max_terminated_length": 1697.0, "completions/mean_length": 277.302734375, "completions/mean_terminated_length": 277.302734375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.37779767233661593, "frac_reward_zero_std": 0.09375, "grad_norm": 0.37130865243666117, "kl": 0.1173095703125, "learning_rate": 3.4314126707703895e-06, "loss": 0.0281, "num_tokens": 99271800.0, "reward": 0.36802494525909424, "reward_std": 0.18693706393241882, "rewards/code_reward/mean": 0.2682202458381653, "rewards/code_reward/std": 0.3701492249965668, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 268.3515625, "completions/mean_terminated_length": 268.3515625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.37958818263205013, "frac_reward_zero_std": 0.21875, "grad_norm": 0.35721490869168004, "kl": 0.135498046875, "learning_rate": 3.4175085134862128e-06, "loss": 0.027, "num_tokens": 99687196.0, "reward": 0.40869516134262085, "reward_std": 0.15164512395858765, "rewards/code_reward/mean": 0.309476375579834, "rewards/code_reward/std": 0.3674429953098297, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 806.0, "completions/mean_length": 262.705078125, "completions/mean_terminated_length": 259.2113342285156, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.38137869292748433, "frac_reward_zero_std": 0.171875, "grad_norm": 0.3812195466158581, "kl": 0.12890625, "learning_rate": 3.4035763488816953e-06, "loss": 0.0453, "num_tokens": 100131485.0, "reward": 0.3470005989074707, "reward_std": 0.1461091935634613, "rewards/code_reward/mean": 0.2479771375656128, "rewards/code_reward/std": 0.33129552006721497, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1724.0, "completions/max_terminated_length": 1724.0, "completions/mean_length": 297.189453125, "completions/mean_terminated_length": 297.189453125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.38316920322291853, "frac_reward_zero_std": 0.078125, "grad_norm": 0.3779753616772237, "kl": 0.1290283203125, "learning_rate": 3.3896167615225594e-06, "loss": 0.0048, "num_tokens": 100595462.0, "reward": 0.3502082824707031, "reward_std": 0.16613370180130005, "rewards/code_reward/mean": 0.2511848509311676, "rewards/code_reward/std": 0.33614709973335266, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1321.0, "completions/mean_length": 293.578125, "completions/mean_terminated_length": 286.69805908203125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.38495971351835273, "frac_reward_zero_std": 0.171875, "grad_norm": 0.34817461753471735, "kl": 0.136962890625, "learning_rate": 3.375630337125133e-06, "loss": 0.0569, "num_tokens": 101071374.0, "reward": 0.42750322818756104, "reward_std": 0.16689267754554749, "rewards/code_reward/mean": 0.3296516537666321, "rewards/code_reward/std": 0.3745054006576538, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1846.0, "completions/mean_length": 294.982421875, "completions/mean_terminated_length": 284.6502990722656, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.38675022381378693, "frac_reward_zero_std": 0.125, "grad_norm": 0.3844586125314766, "kl": 0.1282958984375, "learning_rate": 3.361617662531772e-06, "loss": 0.0621, "num_tokens": 101532925.0, "reward": 0.37287285923957825, "reward_std": 0.14533773064613342, "rewards/code_reward/mean": 0.2744353413581848, "rewards/code_reward/std": 0.35664790868759155, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1735.0, "completions/mean_length": 294.40625, "completions/mean_terminated_length": 287.5294189453125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.38854073410922113, "frac_reward_zero_std": 0.203125, "grad_norm": 0.35508963071106875, "kl": 0.136962890625, "learning_rate": 3.347579325686237e-06, "loss": 0.0591, "num_tokens": 102026949.0, "reward": 0.4594467282295227, "reward_std": 0.164394348859787, "rewards/code_reward/mean": 0.36159512400627136, "rewards/code_reward/std": 0.3968314230442047, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 740.0, "completions/mean_length": 264.482421875, "completions/mean_terminated_length": 253.9705352783203, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.39033124440465533, "frac_reward_zero_std": 0.109375, "grad_norm": 0.36859952595136003, "kl": 0.1300048828125, "learning_rate": 3.333515915609027e-06, "loss": 0.082, "num_tokens": 102455044.0, "reward": 0.3543703556060791, "reward_std": 0.14556902647018433, "rewards/code_reward/mean": 0.2563234567642212, "rewards/code_reward/std": 0.3398793339729309, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1289.0, "completions/mean_length": 267.7578125, "completions/mean_terminated_length": 264.2739562988281, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.39212175470008953, "frac_reward_zero_std": 0.171875, "grad_norm": 0.3676956995353152, "kl": 0.133056640625, "learning_rate": 3.3194280223726616e-06, "loss": 0.0208, "num_tokens": 102887888.0, "reward": 0.4334809482097626, "reward_std": 0.1961210072040558, "rewards/code_reward/mean": 0.3342621922492981, "rewards/code_reward/std": 0.37071192264556885, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1406.0, "completions/max_terminated_length": 1406.0, "completions/mean_length": 257.087890625, "completions/mean_terminated_length": 257.087890625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.39391226499552373, "frac_reward_zero_std": 0.15625, "grad_norm": 0.3749793648691599, "kl": 0.139892578125, "learning_rate": 3.305316237076927e-06, "loss": 0.0076, "num_tokens": 103358157.0, "reward": 0.45818227529525757, "reward_std": 0.1924254596233368, "rewards/code_reward/mean": 0.3589635491371155, "rewards/code_reward/std": 0.39055144786834717, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1382.0, "completions/max_terminated_length": 1382.0, "completions/mean_length": 258.1640625, "completions/mean_terminated_length": 258.1640625, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.39570277529095793, "frac_reward_zero_std": 0.1875, "grad_norm": 0.4089389548104344, "kl": 0.14208984375, "learning_rate": 3.291181151824071e-06, "loss": 0.0435, "num_tokens": 103797945.0, "reward": 0.4638136625289917, "reward_std": 0.13592302799224854, "rewards/code_reward/mean": 0.3655714988708496, "rewards/code_reward/std": 0.4041604697704315, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 741.0, "completions/max_terminated_length": 741.0, "completions/mean_length": 243.80078125, "completions/mean_terminated_length": 243.80078125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.39749328558639213, "frac_reward_zero_std": 0.140625, "grad_norm": 0.42282267060141226, "kl": 0.16455078125, "learning_rate": 3.27702335969396e-06, "loss": 0.0302, "num_tokens": 104214323.0, "reward": 0.39971253275871277, "reward_std": 0.13481120765209198, "rewards/code_reward/mean": 0.30068910121917725, "rewards/code_reward/std": 0.3456316888332367, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 821.0, "completions/max_terminated_length": 821.0, "completions/mean_length": 262.0625, "completions/mean_terminated_length": 262.0625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.39928379588182633, "frac_reward_zero_std": 0.1875, "grad_norm": 0.3761467600155683, "kl": 0.14306640625, "learning_rate": 3.2628434547191985e-06, "loss": 0.0373, "num_tokens": 104660827.0, "reward": 0.4002808928489685, "reward_std": 0.12809017300605774, "rewards/code_reward/mean": 0.3006714880466461, "rewards/code_reward/std": 0.33885812759399414, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 743.0, "completions/max_terminated_length": 743.0, "completions/mean_length": 264.052734375, "completions/mean_terminated_length": 264.052734375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.40107430617726053, "frac_reward_zero_std": 0.1875, "grad_norm": 0.38067282388160567, "kl": 0.140380859375, "learning_rate": 3.2486420318601973e-06, "loss": 0.0027, "num_tokens": 105113438.0, "reward": 0.49670183658599854, "reward_std": 0.16555222868919373, "rewards/code_reward/mean": 0.39670178294181824, "rewards/code_reward/std": 0.39365339279174805, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 233.529296875, "completions/mean_terminated_length": 233.529296875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.40286481647269473, "frac_reward_zero_std": 0.265625, "grad_norm": 0.4084076004785319, "kl": 0.1650390625, "learning_rate": 3.2344196869802187e-06, "loss": 0.0252, "num_tokens": 105546901.0, "reward": 0.5064142942428589, "reward_std": 0.13262632489204407, "rewards/code_reward/mean": 0.40700018405914307, "rewards/code_reward/std": 0.38433346152305603, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1623.0, "completions/mean_length": 251.953125, "completions/mean_terminated_length": 244.90982055664062, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.40465532676812893, "frac_reward_zero_std": 0.125, "grad_norm": 0.40384947717885644, "kl": 0.1484375, "learning_rate": 3.2201770168203694e-06, "loss": 0.0263, "num_tokens": 105990829.0, "reward": 0.4535852074623108, "reward_std": 0.17451812326908112, "rewards/code_reward/mean": 0.3543664515018463, "rewards/code_reward/std": 0.35064804553985596, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 240.09375, "completions/mean_terminated_length": 240.09375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.40644583706356313, "frac_reward_zero_std": 0.15625, "grad_norm": 0.41659461330599873, "kl": 0.149169921875, "learning_rate": 3.205914618974563e-06, "loss": 0.005, "num_tokens": 106407565.0, "reward": 0.4618646502494812, "reward_std": 0.1616652011871338, "rewards/code_reward/mean": 0.3622552752494812, "rewards/code_reward/std": 0.37777259945869446, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 853.0, "completions/max_terminated_length": 853.0, "completions/mean_length": 242.9453125, "completions/mean_terminated_length": 242.9453125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.40823634735899733, "frac_reward_zero_std": 0.265625, "grad_norm": 0.3985135487986338, "kl": 0.16015625, "learning_rate": 3.1916330918644496e-06, "loss": 0.0152, "num_tokens": 106840737.0, "reward": 0.5227299332618713, "reward_std": 0.17244362831115723, "rewards/code_reward/mean": 0.4233158826828003, "rewards/code_reward/std": 0.40868815779685974, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1711.0, "completions/max_terminated_length": 1711.0, "completions/mean_length": 231.611328125, "completions/mean_terminated_length": 231.611328125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.41002685765443153, "frac_reward_zero_std": 0.1875, "grad_norm": 0.4205605099907766, "kl": 0.159912109375, "learning_rate": 3.177333034714303e-06, "loss": -0.0005, "num_tokens": 107250930.0, "reward": 0.4783470034599304, "reward_std": 0.17637872695922852, "rewards/code_reward/mean": 0.3793235719203949, "rewards/code_reward/std": 0.3728424906730652, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1361.0, "completions/mean_length": 233.384765625, "completions/mean_terminated_length": 229.8336639404297, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.41181736794986573, "frac_reward_zero_std": 0.3125, "grad_norm": 0.3823336197633164, "kl": 0.162841796875, "learning_rate": 3.1630150475258813e-06, "loss": 0.0641, "num_tokens": 107678279.0, "reward": 0.49588334560394287, "reward_std": 0.09246416389942169, "rewards/code_reward/mean": 0.39685988426208496, "rewards/code_reward/std": 0.3807419538497925, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 736.0, "completions/mean_length": 235.607421875, "completions/mean_terminated_length": 232.0606689453125, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.41360787824529993, "frac_reward_zero_std": 0.203125, "grad_norm": 0.4266164147066108, "kl": 0.1630859375, "learning_rate": 3.148679731053252e-06, "loss": 0.0555, "num_tokens": 108112550.0, "reward": 0.4955623149871826, "reward_std": 0.1304398775100708, "rewards/code_reward/mean": 0.3957575857639313, "rewards/code_reward/std": 0.38638731837272644, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1605.0, "completions/max_terminated_length": 1605.0, "completions/mean_length": 207.29296875, "completions/mean_terminated_length": 207.29296875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.41539838854073413, "frac_reward_zero_std": 0.234375, "grad_norm": 0.44035596087741957, "kl": 0.17578125, "learning_rate": 3.1343276867775805e-06, "loss": 0.0252, "num_tokens": 108501196.0, "reward": 0.4496077001094818, "reward_std": 0.13548870384693146, "rewards/code_reward/mean": 0.3505842685699463, "rewards/code_reward/std": 0.3774520754814148, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 962.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 210.765625, "completions/mean_terminated_length": 210.765625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.41718889883616833, "frac_reward_zero_std": 0.21875, "grad_norm": 0.43440257686208744, "kl": 0.15966796875, "learning_rate": 3.1199595168819043e-06, "loss": 0.0397, "num_tokens": 108913028.0, "reward": 0.45241600275039673, "reward_std": 0.13097399473190308, "rewards/code_reward/mean": 0.3526113033294678, "rewards/code_reward/std": 0.3926211893558502, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1145.0, "completions/max_terminated_length": 1145.0, "completions/mean_length": 196.857421875, "completions/mean_terminated_length": 196.857421875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.41897940913160253, "frac_reward_zero_std": 0.25, "grad_norm": 0.5129014262706565, "kl": 0.216552734375, "learning_rate": 3.105575824225852e-06, "loss": 0.0229, "num_tokens": 109319275.0, "reward": 0.4432060122489929, "reward_std": 0.13625825941562653, "rewards/code_reward/mean": 0.34359660744667053, "rewards/code_reward/std": 0.3494217097759247, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 209.37890625, "completions/mean_terminated_length": 208.93150329589844, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.42076991942703673, "frac_reward_zero_std": 0.21875, "grad_norm": 0.9728476627584296, "kl": 0.72119140625, "learning_rate": 3.091177212320363e-06, "loss": 0.0187, "num_tokens": 109727237.0, "reward": 0.3970165550708771, "reward_std": 0.11434811353683472, "rewards/code_reward/mean": 0.2981884479522705, "rewards/code_reward/std": 0.35641324520111084, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1314.0, "completions/max_terminated_length": 1314.0, "completions/mean_length": 192.134765625, "completions/mean_terminated_length": 192.134765625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.42256042972247093, "frac_reward_zero_std": 0.21875, "grad_norm": 0.4287587514631123, "kl": 0.175537109375, "learning_rate": 3.0767642853023538e-06, "loss": 0.0177, "num_tokens": 110130010.0, "reward": 0.527762770652771, "reward_std": 0.13451485335826874, "rewards/code_reward/mean": 0.4287393093109131, "rewards/code_reward/std": 0.3786221146583557, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1524.0, "completions/max_terminated_length": 1524.0, "completions/mean_length": 197.400390625, "completions/mean_terminated_length": 197.400390625, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.4243509400179051, "frac_reward_zero_std": 0.296875, "grad_norm": 0.4134167578366523, "kl": 0.18896484375, "learning_rate": 3.062337647909376e-06, "loss": -0.0038, "num_tokens": 110526423.0, "reward": 0.44963058829307556, "reward_std": 0.08041568845510483, "rewards/code_reward/mean": 0.35021650791168213, "rewards/code_reward/std": 0.37951287627220154, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1559.0, "completions/max_terminated_length": 1559.0, "completions/mean_length": 198.640625, "completions/mean_terminated_length": 198.640625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.4261414503133393, "frac_reward_zero_std": 0.203125, "grad_norm": 0.425884523161256, "kl": 0.190673828125, "learning_rate": 3.04789790545424e-06, "loss": 0.0374, "num_tokens": 110925703.0, "reward": 0.45294564962387085, "reward_std": 0.11002074927091599, "rewards/code_reward/mean": 0.3535315990447998, "rewards/code_reward/std": 0.36082980036735535, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 199.361328125, "completions/mean_terminated_length": 195.74363708496094, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.4279319606087735, "frac_reward_zero_std": 0.296875, "grad_norm": 0.5104317961340997, "kl": 0.2236328125, "learning_rate": 3.033445663799621e-06, "loss": 0.0281, "num_tokens": 111305432.0, "reward": 0.5253987908363342, "reward_std": 0.1264045685529709, "rewards/code_reward/mean": 0.4259846806526184, "rewards/code_reward/std": 0.38333970308303833, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 202.984375, "completions/mean_terminated_length": 199.373779296875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.4297224709042077, "frac_reward_zero_std": 0.1875, "grad_norm": 0.47009827228058687, "kl": 0.196533203125, "learning_rate": 3.018981529332633e-06, "loss": 0.0523, "num_tokens": 111719568.0, "reward": 0.4501265287399292, "reward_std": 0.11718727648258209, "rewards/code_reward/mean": 0.3505171537399292, "rewards/code_reward/std": 0.3486543893814087, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2042.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 197.9375, "completions/mean_terminated_length": 197.9375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.4315129811996419, "frac_reward_zero_std": 0.328125, "grad_norm": 0.39505392193477995, "kl": 0.180419921875, "learning_rate": 3.00450610893939e-06, "loss": 0.0068, "num_tokens": 112114880.0, "reward": 0.4074530005455017, "reward_std": 0.10436570644378662, "rewards/code_reward/mean": 0.30862486362457275, "rewards/code_reward/std": 0.3525501787662506, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 653.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 198.970703125, "completions/mean_terminated_length": 198.970703125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.4333034914950761, "frac_reward_zero_std": 0.171875, "grad_norm": 0.42493354399376637, "kl": 0.18017578125, "learning_rate": 2.9900200099795396e-06, "loss": 0.0179, "num_tokens": 112503329.0, "reward": 0.42015478014945984, "reward_std": 0.11209665983915329, "rewards/code_reward/mean": 0.32093602418899536, "rewards/code_reward/std": 0.3432243764400482, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1425.0, "completions/max_terminated_length": 1425.0, "completions/mean_length": 212.50390625, "completions/mean_terminated_length": 212.50390625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.4350940017905103, "frac_reward_zero_std": 0.28125, "grad_norm": 0.38807054120719386, "kl": 0.16943359375, "learning_rate": 2.9755238402607826e-06, "loss": 0.0252, "num_tokens": 112922643.0, "reward": 0.43005281686782837, "reward_std": 0.0921945869922638, "rewards/code_reward/mean": 0.33102935552597046, "rewards/code_reward/std": 0.3609362840652466, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 790.0, "completions/mean_length": 211.66015625, "completions/mean_terminated_length": 204.45883178710938, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.4368845120859445, "frac_reward_zero_std": 0.328125, "grad_norm": 0.37098509360478776, "kl": 0.1796875, "learning_rate": 2.961018208013367e-06, "loss": 0.0733, "num_tokens": 113343813.0, "reward": 0.518032431602478, "reward_std": 0.12325772643089294, "rewards/code_reward/mean": 0.4195949137210846, "rewards/code_reward/std": 0.4044349193572998, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1944.0, "completions/max_terminated_length": 1944.0, "completions/mean_length": 224.060546875, "completions/mean_terminated_length": 224.060546875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.4386750223813787, "frac_reward_zero_std": 0.234375, "grad_norm": 0.3661559429795215, "kl": 0.159912109375, "learning_rate": 2.9465037218645694e-06, "loss": 0.0157, "num_tokens": 113737948.0, "reward": 0.5257256627082825, "reward_std": 0.12493546307086945, "rewards/code_reward/mean": 0.4259209632873535, "rewards/code_reward/std": 0.40468913316726685, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1793.0, "completions/max_terminated_length": 1793.0, "completions/mean_length": 230.541015625, "completions/mean_terminated_length": 230.541015625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.4404655326768129, "frac_reward_zero_std": 0.1875, "grad_norm": 0.39747229801742967, "kl": 0.179931640625, "learning_rate": 2.9319809908131604e-06, "loss": 0.0134, "num_tokens": 114160825.0, "reward": 0.4408986270427704, "reward_std": 0.10659031569957733, "rewards/code_reward/mean": 0.342265784740448, "rewards/code_reward/std": 0.37365952134132385, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1615.0, "completions/max_terminated_length": 1615.0, "completions/mean_length": 252.1015625, "completions/mean_terminated_length": 252.1015625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.4422560429722471, "frac_reward_zero_std": 0.171875, "grad_norm": 0.4285628093947806, "kl": 0.1748046875, "learning_rate": 2.917450624203847e-06, "loss": 0.0359, "num_tokens": 114635701.0, "reward": 0.4481284022331238, "reward_std": 0.11817815154790878, "rewards/code_reward/mean": 0.34871435165405273, "rewards/code_reward/std": 0.3624989688396454, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1214.0, "completions/mean_length": 247.34765625, "completions/mean_terminated_length": 240.28628540039062, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.4440465532676813, "frac_reward_zero_std": 0.171875, "grad_norm": 0.3734520815548418, "kl": 0.161376953125, "learning_rate": 2.9029132317017118e-06, "loss": 0.0885, "num_tokens": 115076423.0, "reward": 0.43092256784439087, "reward_std": 0.13719581067562103, "rewards/code_reward/mean": 0.3320944309234619, "rewards/code_reward/std": 0.3568250834941864, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1321.0, "completions/max_terminated_length": 1321.0, "completions/mean_length": 235.763671875, "completions/mean_terminated_length": 235.763671875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.4458370635631155, "frac_reward_zero_std": 0.15625, "grad_norm": 0.4060023368006606, "kl": 0.193359375, "learning_rate": 2.888369423266629e-06, "loss": 0.0357, "num_tokens": 115514766.0, "reward": 0.51524418592453, "reward_std": 0.1388806849718094, "rewards/code_reward/mean": 0.4162207245826721, "rewards/code_reward/std": 0.3517437279224396, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 913.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 238.10546875, "completions/mean_terminated_length": 238.10546875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.4476275738585497, "frac_reward_zero_std": 0.21875, "grad_norm": 0.36689277105225304, "kl": 0.16357421875, "learning_rate": 2.8738198091276712e-06, "loss": 0.0319, "num_tokens": 115933452.0, "reward": 0.44425785541534424, "reward_std": 0.13341215252876282, "rewards/code_reward/mean": 0.3448438048362732, "rewards/code_reward/std": 0.37386608123779297, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 237.5703125, "completions/mean_terminated_length": 234.0273895263672, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.4494180841539839, "frac_reward_zero_std": 0.203125, "grad_norm": 0.4020894972876976, "kl": 0.18408203125, "learning_rate": 2.859264999757509e-06, "loss": 0.0014, "num_tokens": 116335152.0, "reward": 0.43313419818878174, "reward_std": 0.1194508969783783, "rewards/code_reward/mean": 0.33391544222831726, "rewards/code_reward/std": 0.3682665228843689, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1662.0, "completions/mean_length": 248.09765625, "completions/mean_terminated_length": 244.57534790039062, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.4512085944494181, "frac_reward_zero_std": 0.28125, "grad_norm": 0.3534244573091954, "kl": 0.157470703125, "learning_rate": 2.8447056058467928e-06, "loss": 0.04, "num_tokens": 116795546.0, "reward": 0.4347473084926605, "reward_std": 0.10823085904121399, "rewards/code_reward/mean": 0.3357238471508026, "rewards/code_reward/std": 0.362831711769104, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1003.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 242.84765625, "completions/mean_terminated_length": 241.36007690429688, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.4529991047448523, "frac_reward_zero_std": 0.359375, "grad_norm": 0.3328948749912819, "kl": 0.146484375, "learning_rate": 2.830142238278531e-06, "loss": 0.0246, "num_tokens": 117193140.0, "reward": 0.4876568913459778, "reward_std": 0.120115265250206, "rewards/code_reward/mean": 0.38824284076690674, "rewards/code_reward/std": 0.38715317845344543, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1152.0, "completions/max_terminated_length": 1152.0, "completions/mean_length": 246.947265625, "completions/mean_terminated_length": 246.947265625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.4547896150402865, "frac_reward_zero_std": 0.296875, "grad_norm": 0.35183328881334314, "kl": 0.15673828125, "learning_rate": 2.81557550810246e-06, "loss": 0.0291, "num_tokens": 117619513.0, "reward": 0.5046653747558594, "reward_std": 0.11497897654771805, "rewards/code_reward/mean": 0.4050559401512146, "rewards/code_reward/std": 0.36989960074424744, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1888.0, "completions/mean_length": 274.9140625, "completions/mean_terminated_length": 271.4442138671875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.4565801253357207, "frac_reward_zero_std": 0.21875, "grad_norm": 0.33822341854562216, "kl": 0.182861328125, "learning_rate": 2.8010060265094026e-06, "loss": 0.0343, "num_tokens": 118077445.0, "reward": 0.5210433006286621, "reward_std": 0.15960867702960968, "rewards/code_reward/mean": 0.42182451486587524, "rewards/code_reward/std": 0.38566353917121887, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 988.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 255.291015625, "completions/mean_terminated_length": 255.291015625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.4583706356311549, "frac_reward_zero_std": 0.296875, "grad_norm": 0.36475958990902807, "kl": 0.162109375, "learning_rate": 2.786434404805629e-06, "loss": 0.0375, "num_tokens": 118513858.0, "reward": 0.4802195727825165, "reward_std": 0.09935116022825241, "rewards/code_reward/mean": 0.3806101977825165, "rewards/code_reward/std": 0.3838985562324524, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 261.48828125, "completions/mean_terminated_length": 257.9921569824219, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.4601611459265891, "frac_reward_zero_std": 0.1875, "grad_norm": 0.3708761489545171, "kl": 0.15234375, "learning_rate": 2.771861254387199e-06, "loss": 0.0361, "num_tokens": 118937156.0, "reward": 0.4351547956466675, "reward_std": 0.13754497468471527, "rewards/code_reward/mean": 0.33613133430480957, "rewards/code_reward/std": 0.34836623072624207, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 820.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 271.10546875, "completions/mean_terminated_length": 271.10546875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.4619516562220233, "frac_reward_zero_std": 0.21875, "grad_norm": 0.36350645859651187, "kl": 0.164306640625, "learning_rate": 2.7572871867143204e-06, "loss": 0.0261, "num_tokens": 119414986.0, "reward": 0.4408857226371765, "reward_std": 0.10210537910461426, "rewards/code_reward/mean": 0.3414716422557831, "rewards/code_reward/std": 0.35740023851394653, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1831.0, "completions/max_terminated_length": 1831.0, "completions/mean_length": 259.853515625, "completions/mean_terminated_length": 259.853515625, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.4637421665174575, "frac_reward_zero_std": 0.125, "grad_norm": 0.4177235267986355, "kl": 0.18212890625, "learning_rate": 2.742712813285681e-06, "loss": 0.0504, "num_tokens": 119877815.0, "reward": 0.5032874345779419, "reward_std": 0.17334052920341492, "rewards/code_reward/mean": 0.404263973236084, "rewards/code_reward/std": 0.37556812167167664, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1675.0, "completions/max_terminated_length": 1675.0, "completions/mean_length": 270.69140625, "completions/mean_terminated_length": 270.69140625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.4655326768128917, "frac_reward_zero_std": 0.203125, "grad_norm": 0.6103436105896789, "kl": 0.244384765625, "learning_rate": 2.7281387456128017e-06, "loss": 0.0322, "num_tokens": 120304249.0, "reward": 0.43926751613616943, "reward_std": 0.10499735176563263, "rewards/code_reward/mean": 0.33946284651756287, "rewards/code_reward/std": 0.33817508816719055, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1800.0, "completions/max_terminated_length": 1800.0, "completions/mean_length": 262.669921875, "completions/mean_terminated_length": 262.669921875, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.4673231871083259, "frac_reward_zero_std": 0.21875, "grad_norm": 0.3738493896948031, "kl": 0.159423828125, "learning_rate": 2.7135655951943716e-06, "loss": 0.0089, "num_tokens": 120761224.0, "reward": 0.42706286907196045, "reward_std": 0.08372801542282104, "rewards/code_reward/mean": 0.3276488184928894, "rewards/code_reward/std": 0.3480439782142639, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1912.0, "completions/mean_length": 269.001953125, "completions/mean_terminated_length": 262.0255126953125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.4691136974037601, "frac_reward_zero_std": 0.328125, "grad_norm": 0.32544692430733607, "kl": 0.144775390625, "learning_rate": 2.698993973490598e-06, "loss": 0.0485, "num_tokens": 121176185.0, "reward": 0.5194603204727173, "reward_std": 0.09980335831642151, "rewards/code_reward/mean": 0.4202415347099304, "rewards/code_reward/std": 0.37102240324020386, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 835.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 254.32421875, "completions/mean_terminated_length": 254.32421875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.4709042076991943, "frac_reward_zero_std": 0.28125, "grad_norm": 0.3742367544683102, "kl": 0.158447265625, "learning_rate": 2.6844244918975416e-06, "loss": 0.0199, "num_tokens": 121609567.0, "reward": 0.47886574268341064, "reward_std": 0.08044504374265671, "rewards/code_reward/mean": 0.3796469569206238, "rewards/code_reward/std": 0.36523422598838806, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1327.0, "completions/max_terminated_length": 1327.0, "completions/mean_length": 252.798828125, "completions/mean_terminated_length": 252.798828125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.4726947179946285, "frac_reward_zero_std": 0.21875, "grad_norm": 0.3963452954926581, "kl": 0.171630859375, "learning_rate": 2.66985776172147e-06, "loss": 0.0131, "num_tokens": 122042384.0, "reward": 0.47197479009628296, "reward_std": 0.11795878410339355, "rewards/code_reward/mean": 0.3727560341358185, "rewards/code_reward/std": 0.37341251969337463, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1872.0, "completions/max_terminated_length": 1872.0, "completions/mean_length": 262.2265625, "completions/mean_terminated_length": 262.2265625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.4744852282900627, "frac_reward_zero_std": 0.234375, "grad_norm": 0.35622746942339073, "kl": 0.16162109375, "learning_rate": 2.6552943941532088e-06, "loss": 0.0244, "num_tokens": 122479740.0, "reward": 0.4252495765686035, "reward_std": 0.10891364514827728, "rewards/code_reward/mean": 0.3258354961872101, "rewards/code_reward/std": 0.3643730580806732, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1931.0, "completions/mean_length": 260.318359375, "completions/mean_terminated_length": 256.8199462890625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.4762757385854969, "frac_reward_zero_std": 0.265625, "grad_norm": 0.3683364869484586, "kl": 0.147216796875, "learning_rate": 2.6407350002424927e-06, "loss": 0.0426, "num_tokens": 122906263.0, "reward": 0.46870866417884827, "reward_std": 0.1297549456357956, "rewards/code_reward/mean": 0.3694899082183838, "rewards/code_reward/std": 0.37440669536590576, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1168.0, "completions/max_terminated_length": 1168.0, "completions/mean_length": 258.078125, "completions/mean_terminated_length": 258.078125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.4780662488809311, "frac_reward_zero_std": 0.28125, "grad_norm": 0.3767409080423426, "kl": 0.18115234375, "learning_rate": 2.626180190872329e-06, "loss": 0.0193, "num_tokens": 123343871.0, "reward": 0.43553704023361206, "reward_std": 0.13232161104679108, "rewards/code_reward/mean": 0.33709949254989624, "rewards/code_reward/std": 0.3650631308555603, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 718.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 263.953125, "completions/mean_terminated_length": 263.953125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.4798567591763653, "frac_reward_zero_std": 0.28125, "grad_norm": 0.34412540266381986, "kl": 0.1552734375, "learning_rate": 2.611630576733372e-06, "loss": 0.0028, "num_tokens": 123798175.0, "reward": 0.4602980613708496, "reward_std": 0.14586561918258667, "rewards/code_reward/mean": 0.36146992444992065, "rewards/code_reward/std": 0.3799441456794739, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 706.0, "completions/max_terminated_length": 706.0, "completions/mean_length": 258.705078125, "completions/mean_terminated_length": 258.705078125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.4816472694717995, "frac_reward_zero_std": 0.296875, "grad_norm": 0.3681911311652466, "kl": 0.150390625, "learning_rate": 2.5970867682982885e-06, "loss": -0.0004, "num_tokens": 124217464.0, "reward": 0.4652305841445923, "reward_std": 0.09535543620586395, "rewards/code_reward/mean": 0.36581650376319885, "rewards/code_reward/std": 0.351797491312027, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1327.0, "completions/max_terminated_length": 1327.0, "completions/mean_length": 269.791015625, "completions/mean_terminated_length": 269.791015625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.4834377797672337, "frac_reward_zero_std": 0.234375, "grad_norm": 0.3862486870403331, "kl": 0.173095703125, "learning_rate": 2.582549375796154e-06, "loss": 0.0138, "num_tokens": 124668613.0, "reward": 0.4482315182685852, "reward_std": 0.10594505816698074, "rewards/code_reward/mean": 0.3495987057685852, "rewards/code_reward/std": 0.3537541925907135, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 276.142578125, "completions/mean_terminated_length": 272.6751403808594, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.4852282900626679, "frac_reward_zero_std": 0.25, "grad_norm": 0.37959249861419697, "kl": 0.150146484375, "learning_rate": 2.568019009186841e-06, "loss": 0.0226, "num_tokens": 125133718.0, "reward": 0.3963351249694824, "reward_std": 0.10435068607330322, "rewards/code_reward/mean": 0.29750698804855347, "rewards/code_reward/std": 0.3483254015445709, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 828.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 263.4140625, "completions/mean_terminated_length": 263.4140625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.4870188003581021, "frac_reward_zero_std": 0.21875, "grad_norm": 0.4516198699943225, "kl": 0.18212890625, "learning_rate": 2.5534962781354317e-06, "loss": 0.0, "num_tokens": 125593530.0, "reward": 0.45676475763320923, "reward_std": 0.11615435779094696, "rewards/code_reward/mean": 0.3583272695541382, "rewards/code_reward/std": 0.35742318630218506, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1385.0, "completions/max_terminated_length": 1385.0, "completions/mean_length": 259.98046875, "completions/mean_terminated_length": 259.98046875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.4888093106535363, "frac_reward_zero_std": 0.25, "grad_norm": 0.3584651737018534, "kl": 0.15283203125, "learning_rate": 2.538981791986634e-06, "loss": -0.0014, "num_tokens": 126016928.0, "reward": 0.5036612749099731, "reward_std": 0.10386145114898682, "rewards/code_reward/mean": 0.40502843260765076, "rewards/code_reward/std": 0.351330429315567, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 954.0, "completions/max_terminated_length": 836.0, "completions/mean_length": 282.7109375, "completions/mean_terminated_length": 281.3972473144531, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.4905998209489705, "frac_reward_zero_std": 0.234375, "grad_norm": 0.3680304043069243, "kl": 0.150146484375, "learning_rate": 2.524476159739218e-06, "loss": 0.0394, "num_tokens": 126474724.0, "reward": 0.4612274765968323, "reward_std": 0.11476659774780273, "rewards/code_reward/mean": 0.3629852533340454, "rewards/code_reward/std": 0.37470799684524536, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 778.0, "completions/mean_length": 291.275390625, "completions/mean_terminated_length": 287.83758544921875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.4923903312444047, "frac_reward_zero_std": 0.265625, "grad_norm": 0.3554753792797297, "kl": 0.14794921875, "learning_rate": 2.5099799900204607e-06, "loss": 0.0457, "num_tokens": 126944265.0, "reward": 0.4623337388038635, "reward_std": 0.11638066917657852, "rewards/code_reward/mean": 0.3633102774620056, "rewards/code_reward/std": 0.36648857593536377, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 296.71484375, "completions/mean_terminated_length": 289.8470764160156, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.4941808415398389, "frac_reward_zero_std": 0.171875, "grad_norm": 0.3619027753262256, "kl": 0.139892578125, "learning_rate": 2.4954938910606108e-06, "loss": 0.0366, "num_tokens": 127388463.0, "reward": 0.4132917523384094, "reward_std": 0.1279207468032837, "rewards/code_reward/mean": 0.3146589398384094, "rewards/code_reward/std": 0.34325623512268066, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1203.0, "completions/max_terminated_length": 1203.0, "completions/mean_length": 299.67578125, "completions/mean_terminated_length": 299.67578125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.4959713518352731, "frac_reward_zero_std": 0.21875, "grad_norm": 0.3526750578994669, "kl": 0.14599609375, "learning_rate": 2.481018470667368e-06, "loss": 0.0039, "num_tokens": 127831257.0, "reward": 0.5036460161209106, "reward_std": 0.12540686130523682, "rewards/code_reward/mean": 0.40442726016044617, "rewards/code_reward/std": 0.38424089550971985, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1462.0, "completions/mean_length": 288.302734375, "completions/mean_terminated_length": 284.8591003417969, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.4977618621307073, "frac_reward_zero_std": 0.265625, "grad_norm": 0.3435133097867181, "kl": 0.15087890625, "learning_rate": 2.4665543362003802e-06, "loss": 0.0333, "num_tokens": 128264444.0, "reward": 0.504298746585846, "reward_std": 0.14167757332324982, "rewards/code_reward/mean": 0.40566593408584595, "rewards/code_reward/std": 0.3765021562576294, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1563.0, "completions/max_terminated_length": 1563.0, "completions/mean_length": 286.115234375, "completions/mean_terminated_length": 286.115234375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.4995523724261415, "frac_reward_zero_std": 0.375, "grad_norm": 0.33178087558476266, "kl": 0.15234375, "learning_rate": 2.4521020945457615e-06, "loss": 0.0266, "num_tokens": 128762247.0, "reward": 0.576686441898346, "reward_std": 0.09095828235149384, "rewards/code_reward/mean": 0.47766298055648804, "rewards/code_reward/std": 0.4140399098396301, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1772.0, "completions/mean_length": 306.66796875, "completions/mean_terminated_length": 303.2602844238281, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.5013428827215757, "frac_reward_zero_std": 0.234375, "grad_norm": 0.35906951596171577, "kl": 0.14404296875, "learning_rate": 2.4376623520906255e-06, "loss": 0.0377, "num_tokens": 129232189.0, "reward": 0.5132566690444946, "reward_std": 0.1287151277065277, "rewards/code_reward/mean": 0.4142332077026367, "rewards/code_reward/std": 0.3881847560405731, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1254.0, "completions/mean_length": 309.228515625, "completions/mean_terminated_length": 305.8258361816406, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.5031333930170099, "frac_reward_zero_std": 0.171875, "grad_norm": 0.3452678784144985, "kl": 0.142822265625, "learning_rate": 2.4232357146976478e-06, "loss": 0.0507, "num_tokens": 129718658.0, "reward": 0.482688307762146, "reward_std": 0.09931772202253342, "rewards/code_reward/mean": 0.38386017084121704, "rewards/code_reward/std": 0.3622846007347107, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1106.0, "completions/mean_length": 284.666015625, "completions/mean_terminated_length": 281.21527099609375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.5049239033124441, "frac_reward_zero_std": 0.28125, "grad_norm": 0.3650430699012169, "kl": 0.160400390625, "learning_rate": 2.408822787679637e-06, "loss": 0.0522, "num_tokens": 130164487.0, "reward": 0.556911826133728, "reward_std": 0.10651767253875732, "rewards/code_reward/mean": 0.4580836594104767, "rewards/code_reward/std": 0.39714059233665466, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1029.0, "completions/mean_length": 306.21875, "completions/mean_terminated_length": 302.8101806640625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.5067144136078783, "frac_reward_zero_std": 0.203125, "grad_norm": 0.35809042786064404, "kl": 0.148681640625, "learning_rate": 2.3944241757741475e-06, "loss": 0.023, "num_tokens": 130645743.0, "reward": 0.46997350454330444, "reward_std": 0.11272536218166351, "rewards/code_reward/mean": 0.37075474858283997, "rewards/code_reward/std": 0.35818132758140564, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 857.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 300.818359375, "completions/mean_terminated_length": 300.818359375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.5085049239033125, "frac_reward_zero_std": 0.203125, "grad_norm": 0.451371014899627, "kl": 0.19970703125, "learning_rate": 2.380040483118097e-06, "loss": 0.039, "num_tokens": 131089298.0, "reward": 0.438448965549469, "reward_std": 0.11279679834842682, "rewards/code_reward/mean": 0.339816153049469, "rewards/code_reward/std": 0.34239107370376587, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1354.0, "completions/max_terminated_length": 1354.0, "completions/mean_length": 312.220703125, "completions/mean_terminated_length": 312.220703125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.5102954341987467, "frac_reward_zero_std": 0.25, "grad_norm": 0.3396624515121315, "kl": 0.152587890625, "learning_rate": 2.365672313222419e-06, "loss": 0.0275, "num_tokens": 131557115.0, "reward": 0.42950010299682617, "reward_std": 0.09493934363126755, "rewards/code_reward/mean": 0.32989072799682617, "rewards/code_reward/std": 0.3139655292034149, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 939.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 296.900390625, "completions/mean_terminated_length": 296.900390625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.5120859444941809, "frac_reward_zero_std": 0.234375, "grad_norm": 0.34774635292248857, "kl": 0.152587890625, "learning_rate": 2.351320268946749e-06, "loss": -0.0063, "num_tokens": 132020960.0, "reward": 0.5267815589904785, "reward_std": 0.15310031175613403, "rewards/code_reward/mean": 0.42814868688583374, "rewards/code_reward/std": 0.38612428307533264, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1351.0, "completions/max_terminated_length": 1351.0, "completions/mean_length": 287.642578125, "completions/mean_terminated_length": 287.642578125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.5138764547896151, "frac_reward_zero_std": 0.3125, "grad_norm": 0.3342954772630643, "kl": 0.1513671875, "learning_rate": 2.336984952474119e-06, "loss": 0.0076, "num_tokens": 132463169.0, "reward": 0.5325836539268494, "reward_std": 0.12084920704364777, "rewards/code_reward/mean": 0.4331696331501007, "rewards/code_reward/std": 0.35649585723876953, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 732.0, "completions/max_terminated_length": 732.0, "completions/mean_length": 287.03125, "completions/mean_terminated_length": 287.03125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.5156669650850493, "frac_reward_zero_std": 0.328125, "grad_norm": 0.339733139168529, "kl": 0.154541015625, "learning_rate": 2.322666965285697e-06, "loss": 0.0161, "num_tokens": 132917705.0, "reward": 0.48738783597946167, "reward_std": 0.10990999639034271, "rewards/code_reward/mean": 0.3881691098213196, "rewards/code_reward/std": 0.3600110411643982, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 278.357421875, "completions/mean_terminated_length": 271.41766357421875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.5174574753804835, "frac_reward_zero_std": 0.25, "grad_norm": 0.3459860925011311, "kl": 0.165771484375, "learning_rate": 2.3083669081355507e-06, "loss": 0.0255, "num_tokens": 133354808.0, "reward": 0.5267261266708374, "reward_std": 0.12775260210037231, "rewards/code_reward/mean": 0.42789801955223083, "rewards/code_reward/std": 0.38208410143852234, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 866.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 285.638671875, "completions/mean_terminated_length": 285.638671875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.5192479856759177, "frac_reward_zero_std": 0.15625, "grad_norm": 0.38281490329084394, "kl": 0.156005859375, "learning_rate": 2.2940853810254377e-06, "loss": 0.0196, "num_tokens": 133790647.0, "reward": 0.4899722933769226, "reward_std": 0.13435471057891846, "rewards/code_reward/mean": 0.3913394510746002, "rewards/code_reward/std": 0.3861147165298462, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 948.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 295.080078125, "completions/mean_terminated_length": 295.080078125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.5210384959713519, "frac_reward_zero_std": 0.3125, "grad_norm": 0.35226013873852846, "kl": 0.19287109375, "learning_rate": 2.2798229831796313e-06, "loss": 0.0307, "num_tokens": 134229760.0, "reward": 0.43618863821029663, "reward_std": 0.10009077191352844, "rewards/code_reward/mean": 0.3371651768684387, "rewards/code_reward/std": 0.35083073377609253, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1049.0, "completions/mean_length": 295.51953125, "completions/mean_terminated_length": 288.6470642089844, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.5228290062667861, "frac_reward_zero_std": 0.28125, "grad_norm": 0.32762111918112313, "kl": 0.15478515625, "learning_rate": 2.2655803130197816e-06, "loss": 0.0387, "num_tokens": 134679802.0, "reward": 0.4911244213581085, "reward_std": 0.09962437301874161, "rewards/code_reward/mean": 0.3922962546348572, "rewards/code_reward/std": 0.3483659625053406, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1309.0, "completions/max_terminated_length": 1309.0, "completions/mean_length": 288.431640625, "completions/mean_terminated_length": 288.431640625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.5246195165622203, "frac_reward_zero_std": 0.1875, "grad_norm": 0.36034052861877924, "kl": 0.15869140625, "learning_rate": 2.2513579681398034e-06, "loss": 0.0164, "num_tokens": 135119399.0, "reward": 0.4396505057811737, "reward_std": 0.10546523332595825, "rewards/code_reward/mean": 0.3406270742416382, "rewards/code_reward/std": 0.3443145453929901, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 286.02734375, "completions/mean_terminated_length": 286.02734375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.5264100268576545, "frac_reward_zero_std": 0.234375, "grad_norm": 0.35953464614589087, "kl": 0.138671875, "learning_rate": 2.237156545280803e-06, "loss": 0.0391, "num_tokens": 135533221.0, "reward": 0.4606231153011322, "reward_std": 0.11393949389457703, "rewards/code_reward/mean": 0.3619903326034546, "rewards/code_reward/std": 0.3649303615093231, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1381.0, "completions/max_terminated_length": 1381.0, "completions/mean_length": 283.443359375, "completions/mean_terminated_length": 283.443359375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.5282005371530887, "frac_reward_zero_std": 0.21875, "grad_norm": 0.38152863381733026, "kl": 0.14990234375, "learning_rate": 2.2229766403060403e-06, "loss": 0.0323, "num_tokens": 136002216.0, "reward": 0.5307794213294983, "reward_std": 0.13158637285232544, "rewards/code_reward/mean": 0.43136537075042725, "rewards/code_reward/std": 0.3692026138305664, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1346.0, "completions/max_terminated_length": 1346.0, "completions/mean_length": 267.0703125, "completions/mean_terminated_length": 267.0703125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.5299910474485229, "frac_reward_zero_std": 0.171875, "grad_norm": 0.3856023144195285, "kl": 0.15576171875, "learning_rate": 2.2088188481759305e-06, "loss": 0.0617, "num_tokens": 136453836.0, "reward": 0.47697803378105164, "reward_std": 0.11004804819822311, "rewards/code_reward/mean": 0.37775927782058716, "rewards/code_reward/std": 0.3664804995059967, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 836.0, "completions/max_terminated_length": 836.0, "completions/mean_length": 264.962890625, "completions/mean_terminated_length": 264.962890625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.5317815577439571, "frac_reward_zero_std": 0.234375, "grad_norm": 0.37962706546625613, "kl": 0.158935546875, "learning_rate": 2.194683762923073e-06, "loss": 0.0241, "num_tokens": 136871073.0, "reward": 0.41250044107437134, "reward_std": 0.1235678642988205, "rewards/code_reward/mean": 0.3134769797325134, "rewards/code_reward/std": 0.33655619621276855, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 835.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 266.96484375, "completions/mean_terminated_length": 266.96484375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.5335720680393913, "frac_reward_zero_std": 0.328125, "grad_norm": 0.3394418826625973, "kl": 0.158935546875, "learning_rate": 2.1805719776273387e-06, "loss": 0.0157, "num_tokens": 137308687.0, "reward": 0.5105506181716919, "reward_std": 0.11416419595479965, "rewards/code_reward/mean": 0.41152718663215637, "rewards/code_reward/std": 0.3962363302707672, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1201.0, "completions/max_terminated_length": 1201.0, "completions/mean_length": 261.275390625, "completions/mean_terminated_length": 260.59295654296875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.5353625783348255, "frac_reward_zero_std": 0.3125, "grad_norm": 0.7885875694913055, "kl": 0.162353515625, "learning_rate": 2.166484084390974e-06, "loss": 0.0085, "num_tokens": 137746772.0, "reward": 0.4813706874847412, "reward_std": 0.09051915258169174, "rewards/code_reward/mean": 0.3823472261428833, "rewards/code_reward/std": 0.3741407096385956, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 269.236328125, "completions/mean_terminated_length": 265.75537109375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.5371530886302597, "frac_reward_zero_std": 0.328125, "grad_norm": 0.3210663274576534, "kl": 0.15087890625, "learning_rate": 2.1524206743137636e-06, "loss": 0.0377, "num_tokens": 138191741.0, "reward": 0.45145082473754883, "reward_std": 0.08185850083827972, "rewards/code_reward/mean": 0.3520367741584778, "rewards/code_reward/std": 0.3650685250759125, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 250.611328125, "completions/mean_terminated_length": 250.611328125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.5389435989256938, "frac_reward_zero_std": 0.390625, "grad_norm": 0.3166490382283108, "kl": 0.1669921875, "learning_rate": 2.1383823374682287e-06, "loss": 0.0285, "num_tokens": 138624142.0, "reward": 0.4263192117214203, "reward_std": 0.07955829054117203, "rewards/code_reward/mean": 0.32690516114234924, "rewards/code_reward/std": 0.33764398097991943, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 820.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 253.48828125, "completions/mean_terminated_length": 253.48828125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.540734109221128, "frac_reward_zero_std": 0.1875, "grad_norm": 0.37919009997586817, "kl": 0.158935546875, "learning_rate": 2.124369662874868e-06, "loss": 0.0569, "num_tokens": 139066640.0, "reward": 0.457027792930603, "reward_std": 0.12578177452087402, "rewards/code_reward/mean": 0.3580043911933899, "rewards/code_reward/std": 0.3660334050655365, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 865.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 245.91015625, "completions/mean_terminated_length": 245.91015625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.5425246195165622, "frac_reward_zero_std": 0.28125, "grad_norm": 0.35859547736348485, "kl": 0.1748046875, "learning_rate": 2.110383238477441e-06, "loss": 0.0008, "num_tokens": 139489498.0, "reward": 0.5007085204124451, "reward_std": 0.10705563426017761, "rewards/code_reward/mean": 0.40168505907058716, "rewards/code_reward/std": 0.36149272322654724, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 890.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 240.55859375, "completions/mean_terminated_length": 240.55859375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.5443151298119964, "frac_reward_zero_std": 0.375, "grad_norm": 0.3415373829429483, "kl": 0.16650390625, "learning_rate": 2.096423651118305e-06, "loss": -0.0015, "num_tokens": 139913280.0, "reward": 0.4641948342323303, "reward_std": 0.07166294753551483, "rewards/code_reward/mean": 0.36497604846954346, "rewards/code_reward/std": 0.3553154766559601, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 229.755859375, "completions/mean_terminated_length": 229.755859375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.5461056401074306, "frac_reward_zero_std": 0.359375, "grad_norm": 0.35669566469189107, "kl": 0.169677734375, "learning_rate": 2.082491486513788e-06, "loss": 0.0236, "num_tokens": 140367523.0, "reward": 0.5036875605583191, "reward_std": 0.05965973436832428, "rewards/code_reward/mean": 0.40388286113739014, "rewards/code_reward/std": 0.36128032207489014, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 980.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 248.986328125, "completions/mean_terminated_length": 248.986328125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.5478961504028648, "frac_reward_zero_std": 0.375, "grad_norm": 0.32766220462241613, "kl": 0.166015625, "learning_rate": 2.0685873292296116e-06, "loss": 0.028, "num_tokens": 140806420.0, "reward": 0.5145927667617798, "reward_std": 0.0937511995434761, "rewards/code_reward/mean": 0.41517871618270874, "rewards/code_reward/std": 0.37264516949653625, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 832.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 228.9375, "completions/mean_terminated_length": 228.9375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.549686660698299, "frac_reward_zero_std": 0.25, "grad_norm": 0.41526132876738064, "kl": 0.1982421875, "learning_rate": 2.054711762656369e-06, "loss": -0.005, "num_tokens": 141212324.0, "reward": 0.4458354413509369, "reward_std": 0.08804000914096832, "rewards/code_reward/mean": 0.3460307717323303, "rewards/code_reward/std": 0.36334750056266785, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 696.0, "completions/max_terminated_length": 696.0, "completions/mean_length": 230.73046875, "completions/mean_terminated_length": 230.73046875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.5514771709937332, "frac_reward_zero_std": 0.390625, "grad_norm": 0.3588174335014873, "kl": 0.173583984375, "learning_rate": 2.040865368985044e-06, "loss": 0.0165, "num_tokens": 141605546.0, "reward": 0.4880799651145935, "reward_std": 0.09602624922990799, "rewards/code_reward/mean": 0.3886658549308777, "rewards/code_reward/std": 0.38787147402763367, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 560.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 216.005859375, "completions/mean_terminated_length": 216.005859375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.5532676812891674, "frac_reward_zero_std": 0.375, "grad_norm": 0.38286445996408375, "kl": 0.1923828125, "learning_rate": 2.027048729182583e-06, "loss": 0.0233, "num_tokens": 142010261.0, "reward": 0.4848613739013672, "reward_std": 0.08099614828824997, "rewards/code_reward/mean": 0.385056734085083, "rewards/code_reward/std": 0.3716621398925781, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 898.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 241.9765625, "completions/mean_terminated_length": 241.9765625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.5550581915846016, "frac_reward_zero_std": 0.390625, "grad_norm": 0.35073245293846, "kl": 0.174072265625, "learning_rate": 2.0132624229675205e-06, "loss": 0.0102, "num_tokens": 142469569.0, "reward": 0.5216706991195679, "reward_std": 0.10839828103780746, "rewards/code_reward/mean": 0.42225661873817444, "rewards/code_reward/std": 0.3848792314529419, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 708.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 235.10546875, "completions/mean_terminated_length": 235.10546875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.5568487018800358, "frac_reward_zero_std": 0.40625, "grad_norm": 0.31386748107323054, "kl": 0.186279296875, "learning_rate": 1.9995070287856546e-06, "loss": 0.0019, "num_tokens": 142885847.0, "reward": 0.491472065448761, "reward_std": 0.08415723592042923, "rewards/code_reward/mean": 0.391862690448761, "rewards/code_reward/std": 0.3664725124835968, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 786.0, "completions/max_terminated_length": 786.0, "completions/mean_length": 230.00390625, "completions/mean_terminated_length": 230.00390625, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.55863921217547, "frac_reward_zero_std": 0.34375, "grad_norm": 0.3359116254801687, "kl": 0.1748046875, "learning_rate": 1.985783123785774e-06, "loss": 0.0082, "num_tokens": 143302769.0, "reward": 0.4365425705909729, "reward_std": 0.06291680037975311, "rewards/code_reward/mean": 0.3373238444328308, "rewards/code_reward/std": 0.35851800441741943, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 674.0, "completions/max_terminated_length": 674.0, "completions/mean_length": 229.888671875, "completions/mean_terminated_length": 229.888671875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.5604297224709042, "frac_reward_zero_std": 0.40625, "grad_norm": 0.3334317159710415, "kl": 0.18310546875, "learning_rate": 1.9720912837954486e-06, "loss": -0.0083, "num_tokens": 143694912.0, "reward": 0.5597137212753296, "reward_std": 0.0977717787027359, "rewards/code_reward/mean": 0.4604949355125427, "rewards/code_reward/std": 0.39742037653923035, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1199.0, "completions/max_terminated_length": 1199.0, "completions/mean_length": 239.08984375, "completions/mean_terminated_length": 239.08984375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.5622202327663384, "frac_reward_zero_std": 0.40625, "grad_norm": 0.3362625809564693, "kl": 0.175048828125, "learning_rate": 1.958432083296862e-06, "loss": 0.0149, "num_tokens": 144132054.0, "reward": 0.5464415550231934, "reward_std": 0.07111264020204544, "rewards/code_reward/mean": 0.4476134181022644, "rewards/code_reward/std": 0.3849104344844818, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 244.61328125, "completions/mean_terminated_length": 244.61328125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.5640107430617726, "frac_reward_zero_std": 0.296875, "grad_norm": 0.348755798564054, "kl": 0.17626953125, "learning_rate": 1.9448060954027093e-06, "loss": 0.0035, "num_tokens": 144549168.0, "reward": 0.4561070203781128, "reward_std": 0.09919235110282898, "rewards/code_reward/mean": 0.35669296979904175, "rewards/code_reward/std": 0.3519606590270996, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1274.0, "completions/max_terminated_length": 1274.0, "completions/mean_length": 252.70703125, "completions/mean_terminated_length": 252.70703125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.5658012533572068, "frac_reward_zero_std": 0.296875, "grad_norm": 0.3325720406921214, "kl": 0.171630859375, "learning_rate": 1.931213891832153e-06, "loss": 0.0208, "num_tokens": 144965634.0, "reward": 0.527320146560669, "reward_std": 0.10746044665575027, "rewards/code_reward/mean": 0.4279060661792755, "rewards/code_reward/std": 0.38416796922683716, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 771.0, "completions/max_terminated_length": 771.0, "completions/mean_length": 252.701171875, "completions/mean_terminated_length": 252.701171875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.567591763652641, "frac_reward_zero_std": 0.21875, "grad_norm": 0.3633704309121459, "kl": 0.170166015625, "learning_rate": 1.9176560428868336e-06, "loss": 0.0298, "num_tokens": 145433497.0, "reward": 0.4933386743068695, "reward_std": 0.11941998451948166, "rewards/code_reward/mean": 0.3937292695045471, "rewards/code_reward/std": 0.3744457960128784, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1729.0, "completions/max_terminated_length": 1729.0, "completions/mean_length": 246.80859375, "completions/mean_terminated_length": 246.80859375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.5693822739480752, "frac_reward_zero_std": 0.203125, "grad_norm": 0.39179832311317225, "kl": 0.162841796875, "learning_rate": 1.9041331174269373e-06, "loss": 0.0337, "num_tokens": 145887831.0, "reward": 0.47607454657554626, "reward_std": 0.12170256674289703, "rewards/code_reward/mean": 0.37666046619415283, "rewards/code_reward/std": 0.3516838550567627, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 776.0, "completions/max_terminated_length": 776.0, "completions/mean_length": 258.392578125, "completions/mean_terminated_length": 258.392578125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.5711727842435094, "frac_reward_zero_std": 0.21875, "grad_norm": 0.34221238286126204, "kl": 0.153564453125, "learning_rate": 1.8906456828473341e-06, "loss": 0.0327, "num_tokens": 146323144.0, "reward": 0.5216948390007019, "reward_std": 0.10577403753995895, "rewards/code_reward/mean": 0.42189013957977295, "rewards/code_reward/std": 0.37162524461746216, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 908.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 253.501953125, "completions/mean_terminated_length": 253.501953125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.5729632945389436, "frac_reward_zero_std": 0.421875, "grad_norm": 0.28810722419499696, "kl": 0.166259765625, "learning_rate": 1.8771943050537656e-06, "loss": 0.0158, "num_tokens": 146747329.0, "reward": 0.49245792627334595, "reward_std": 0.08816514164209366, "rewards/code_reward/mean": 0.3930438756942749, "rewards/code_reward/std": 0.3854901194572449, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 696.0, "completions/max_terminated_length": 696.0, "completions/mean_length": 269.103515625, "completions/mean_terminated_length": 269.103515625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.5747538048343778, "frac_reward_zero_std": 0.28125, "grad_norm": 0.34684393248658085, "kl": 0.153076171875, "learning_rate": 1.8637795484391046e-06, "loss": 0.021, "num_tokens": 147186790.0, "reward": 0.3885706067085266, "reward_std": 0.09340377897024155, "rewards/code_reward/mean": 0.2899377942085266, "rewards/code_reward/std": 0.36297503113746643, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 812.0, "completions/max_terminated_length": 812.0, "completions/mean_length": 249.177734375, "completions/mean_terminated_length": 248.09393310546875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.576544315129812, "frac_reward_zero_std": 0.34375, "grad_norm": 31.254772125744452, "kl": 17.26220703125, "learning_rate": 1.8504019758596698e-06, "loss": 0.1689, "num_tokens": 147640777.0, "reward": 0.5415544509887695, "reward_std": 0.07854337990283966, "rewards/code_reward/mean": 0.4415544271469116, "rewards/code_reward/std": 0.3774340748786926, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1485.0, "completions/max_terminated_length": 1485.0, "completions/mean_length": 263.48046875, "completions/mean_terminated_length": 263.48046875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.5783348254252462, "frac_reward_zero_std": 0.34375, "grad_norm": 0.31370363050996636, "kl": 0.164306640625, "learning_rate": 1.8370621486116163e-06, "loss": -0.0016, "num_tokens": 148097543.0, "reward": 0.5864155292510986, "reward_std": 0.10238587111234665, "rewards/code_reward/mean": 0.4864155054092407, "rewards/code_reward/std": 0.3874208331108093, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 828.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 259.3125, "completions/mean_terminated_length": 259.3125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.5801253357206804, "frac_reward_zero_std": 0.375, "grad_norm": 0.30893700522871076, "kl": 0.163330078125, "learning_rate": 1.823760626407377e-06, "loss": 0.0033, "num_tokens": 148541815.0, "reward": 0.5297710299491882, "reward_std": 0.0839281678199768, "rewards/code_reward/mean": 0.4307475984096527, "rewards/code_reward/std": 0.37603074312210083, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 873.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 258.33984375, "completions/mean_terminated_length": 258.33984375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.5819158460161146, "frac_reward_zero_std": 0.265625, "grad_norm": 0.31286372252678063, "kl": 0.16259765625, "learning_rate": 1.8104979673521838e-06, "loss": 0.006, "num_tokens": 148955629.0, "reward": 0.4801298677921295, "reward_std": 0.12337654829025269, "rewards/code_reward/mean": 0.3805204927921295, "rewards/code_reward/std": 0.37292999029159546, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 705.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 253.576171875, "completions/mean_terminated_length": 253.576171875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.5837063563115488, "frac_reward_zero_std": 0.25, "grad_norm": 0.3438944936688693, "kl": 0.173583984375, "learning_rate": 1.7972747279206482e-06, "loss": 0.0204, "num_tokens": 149391708.0, "reward": 0.5328813791275024, "reward_std": 0.11667675524950027, "rewards/code_reward/mean": 0.4334673285484314, "rewards/code_reward/std": 0.3745468258857727, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1161.0, "completions/max_terminated_length": 1161.0, "completions/mean_length": 262.939453125, "completions/mean_terminated_length": 262.939453125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.585496866606983, "frac_reward_zero_std": 0.234375, "grad_norm": 0.3429678323726678, "kl": 0.15478515625, "learning_rate": 1.7840914629334122e-06, "loss": 0.0107, "num_tokens": 149823789.0, "reward": 0.5224900841712952, "reward_std": 0.14950862526893616, "rewards/code_reward/mean": 0.4226853847503662, "rewards/code_reward/std": 0.3809688091278076, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 251.033203125, "completions/mean_terminated_length": 251.033203125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.5872873769024172, "frac_reward_zero_std": 0.3125, "grad_norm": 0.34090476813243176, "kl": 0.1806640625, "learning_rate": 1.7709487255338731e-06, "loss": 0.0054, "num_tokens": 150275638.0, "reward": 0.5350677967071533, "reward_std": 0.11734461784362793, "rewards/code_reward/mean": 0.43565377593040466, "rewards/code_reward/std": 0.36978358030319214, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1116.0, "completions/max_terminated_length": 1116.0, "completions/mean_length": 252.4921875, "completions/mean_terminated_length": 252.4921875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.5890778871978514, "frac_reward_zero_std": 0.234375, "grad_norm": 0.337655047860418, "kl": 0.173095703125, "learning_rate": 1.7578470671649684e-06, "loss": -0.0041, "num_tokens": 150713954.0, "reward": 0.45520615577697754, "reward_std": 0.09433227777481079, "rewards/code_reward/mean": 0.35520613193511963, "rewards/code_reward/std": 0.3572109341621399, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1198.0, "completions/max_terminated_length": 1198.0, "completions/mean_length": 265.20703125, "completions/mean_terminated_length": 265.20703125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.5908683974932856, "frac_reward_zero_std": 0.359375, "grad_norm": 0.3267321723323942, "kl": 0.15478515625, "learning_rate": 1.744787037546045e-06, "loss": 0.0091, "num_tokens": 151151324.0, "reward": 0.49904054403305054, "reward_std": 0.10080684721469879, "rewards/code_reward/mean": 0.4002124071121216, "rewards/code_reward/std": 0.3521833121776581, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 632.0, "completions/max_terminated_length": 632.0, "completions/mean_length": 246.890625, "completions/mean_terminated_length": 246.890625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.5926589077887198, "frac_reward_zero_std": 0.328125, "grad_norm": 0.3428330287976236, "kl": 0.164794921875, "learning_rate": 1.731769184649788e-06, "loss": 0.0218, "num_tokens": 151587508.0, "reward": 0.5454877614974976, "reward_std": 0.09102489054203033, "rewards/code_reward/mean": 0.44548773765563965, "rewards/code_reward/std": 0.38414865732192993, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1264.0, "completions/mean_length": 265.09765625, "completions/mean_terminated_length": 261.6086120605469, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.594449418084154, "frac_reward_zero_std": 0.359375, "grad_norm": 0.31798247620642944, "kl": 0.166748046875, "learning_rate": 1.7187940546792325e-06, "loss": 0.0336, "num_tokens": 152063126.0, "reward": 0.5245329737663269, "reward_std": 0.09518253803253174, "rewards/code_reward/mean": 0.42550957202911377, "rewards/code_reward/std": 0.3837074339389801, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 912.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 261.201171875, "completions/mean_terminated_length": 261.201171875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.5962399283795882, "frac_reward_zero_std": 0.3125, "grad_norm": 0.33579016200932993, "kl": 0.16357421875, "learning_rate": 1.7058621920448465e-06, "loss": 0.0161, "num_tokens": 152500957.0, "reward": 0.4646655321121216, "reward_std": 0.09194419533014297, "rewards/code_reward/mean": 0.36525148153305054, "rewards/code_reward/std": 0.3535217046737671, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 685.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 254.1484375, "completions/mean_terminated_length": 254.1484375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.5980304386750224, "frac_reward_zero_std": 0.375, "grad_norm": 0.3081178885846835, "kl": 0.169677734375, "learning_rate": 1.6929741393416855e-06, "loss": -0.0057, "num_tokens": 152962497.0, "reward": 0.4542083740234375, "reward_std": 0.07877926528453827, "rewards/code_reward/mean": 0.35479432344436646, "rewards/code_reward/std": 0.35485246777534485, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 786.0, "completions/max_terminated_length": 786.0, "completions/mean_length": 253.560546875, "completions/mean_terminated_length": 253.560546875, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.5998209489704566, "frac_reward_zero_std": 0.296875, "grad_norm": 0.3428410764116086, "kl": 0.169677734375, "learning_rate": 1.6801304373266286e-06, "loss": 0.0117, "num_tokens": 153388624.0, "reward": 0.5579234957695007, "reward_std": 0.10708152502775192, "rewards/code_reward/mean": 0.4585094451904297, "rewards/code_reward/std": 0.3984989523887634, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 968.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 261.689453125, "completions/mean_terminated_length": 261.689453125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.6016114592658908, "frac_reward_zero_std": 0.296875, "grad_norm": 0.36729675415934254, "kl": 0.161376953125, "learning_rate": 1.667331624895689e-06, "loss": 0.0123, "num_tokens": 153845513.0, "reward": 0.44786205887794495, "reward_std": 0.10820616036653519, "rewards/code_reward/mean": 0.3480573892593384, "rewards/code_reward/std": 0.35952457785606384, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1901.0, "completions/max_terminated_length": 1901.0, "completions/mean_length": 255.623046875, "completions/mean_terminated_length": 255.623046875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.603401969561325, "frac_reward_zero_std": 0.265625, "grad_norm": 0.3281844654713104, "kl": 0.16259765625, "learning_rate": 1.6545782390614037e-06, "loss": -0.0076, "num_tokens": 154273256.0, "reward": 0.5458970069885254, "reward_std": 0.11055189371109009, "rewards/code_reward/mean": 0.4464830160140991, "rewards/code_reward/std": 0.36546242237091064, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 830.0, "completions/max_terminated_length": 830.0, "completions/mean_length": 250.130859375, "completions/mean_terminated_length": 250.130859375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.6051924798567592, "frac_reward_zero_std": 0.296875, "grad_norm": 0.3596640644855443, "kl": 0.167724609375, "learning_rate": 1.6418708149302992e-06, "loss": 0.032, "num_tokens": 154696907.0, "reward": 0.4649273753166199, "reward_std": 0.10739079862833023, "rewards/code_reward/mean": 0.36590397357940674, "rewards/code_reward/std": 0.38660863041877747, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 918.0, "completions/max_terminated_length": 918.0, "completions/mean_length": 260.212890625, "completions/mean_terminated_length": 260.212890625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.6069829901521934, "frac_reward_zero_std": 0.296875, "grad_norm": 0.3370057273805934, "kl": 0.1630859375, "learning_rate": 1.6292098856804423e-06, "loss": -0.0091, "num_tokens": 155147576.0, "reward": 0.5444281101226807, "reward_std": 0.12403807789087296, "rewards/code_reward/mean": 0.4455999433994293, "rewards/code_reward/std": 0.3934081792831421, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 251.1640625, "completions/mean_terminated_length": 251.1640625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.6087735004476276, "frac_reward_zero_std": 0.234375, "grad_norm": 0.3697737516722685, "kl": 0.166259765625, "learning_rate": 1.6165959825390661e-06, "loss": -0.0053, "num_tokens": 155547108.0, "reward": 0.4673112630844116, "reward_std": 0.11813505738973618, "rewards/code_reward/mean": 0.36809247732162476, "rewards/code_reward/std": 0.34190526604652405, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 729.0, "completions/max_terminated_length": 729.0, "completions/mean_length": 252.1015625, "completions/mean_terminated_length": 252.1015625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.6105640107430618, "frac_reward_zero_std": 0.296875, "grad_norm": 0.34138931554690133, "kl": 0.172119140625, "learning_rate": 1.604029634760284e-06, "loss": 0.0314, "num_tokens": 155969624.0, "reward": 0.5247693061828613, "reward_std": 0.09996587783098221, "rewards/code_reward/mean": 0.4267224669456482, "rewards/code_reward/std": 0.3673737943172455, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 262.5078125, "completions/mean_terminated_length": 262.5078125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.612354521038496, "frac_reward_zero_std": 0.3125, "grad_norm": 0.33787588989378636, "kl": 0.158447265625, "learning_rate": 1.59151136960288e-06, "loss": 0.0324, "num_tokens": 156394236.0, "reward": 0.457765132188797, "reward_std": 0.10225236415863037, "rewards/code_reward/mean": 0.3585463762283325, "rewards/code_reward/std": 0.3531979024410248, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 971.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 278.158203125, "completions/mean_terminated_length": 278.158203125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.6141450313339302, "frac_reward_zero_std": 0.328125, "grad_norm": 0.33210818381955, "kl": 0.164794921875, "learning_rate": 1.5790417123081903e-06, "loss": 0.0241, "num_tokens": 156834893.0, "reward": 0.5492209196090698, "reward_std": 0.10351793467998505, "rewards/code_reward/mean": 0.4492208957672119, "rewards/code_reward/std": 0.39261409640312195, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1528.0, "completions/max_terminated_length": 1528.0, "completions/mean_length": 271.728515625, "completions/mean_terminated_length": 271.728515625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.6159355416293644, "frac_reward_zero_std": 0.296875, "grad_norm": 0.3354115749157992, "kl": 0.156982421875, "learning_rate": 1.5666211860780583e-06, "loss": 0.01, "num_tokens": 157286818.0, "reward": 0.5849733352661133, "reward_std": 0.08943741768598557, "rewards/code_reward/mean": 0.48555925488471985, "rewards/code_reward/std": 0.37248480319976807, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1137.0, "completions/max_terminated_length": 1137.0, "completions/mean_length": 269.8203125, "completions/mean_terminated_length": 269.8203125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.6177260519247986, "frac_reward_zero_std": 0.1875, "grad_norm": 0.34924467782790897, "kl": 0.174560546875, "learning_rate": 1.5542503120528918e-06, "loss": 0.0208, "num_tokens": 157746526.0, "reward": 0.4512224495410919, "reward_std": 0.10372377932071686, "rewards/code_reward/mean": 0.3518083989620209, "rewards/code_reward/std": 0.34128257632255554, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 797.0, "completions/mean_length": 269.796875, "completions/mean_terminated_length": 266.3170166015625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.6195165622202328, "frac_reward_zero_std": 0.265625, "grad_norm": 0.3256400934023469, "kl": 0.156005859375, "learning_rate": 1.5419296092897866e-06, "loss": 0.0138, "num_tokens": 158196438.0, "reward": 0.4688138961791992, "reward_std": 0.08863537758588791, "rewards/code_reward/mean": 0.3692045211791992, "rewards/code_reward/std": 0.36825183033943176, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 720.0, "completions/max_terminated_length": 720.0, "completions/mean_length": 275.064453125, "completions/mean_terminated_length": 275.064453125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.621307072515667, "frac_reward_zero_std": 0.328125, "grad_norm": 0.3164216484840336, "kl": 0.171142578125, "learning_rate": 1.529659594740755e-06, "loss": 0.0124, "num_tokens": 158680415.0, "reward": 0.5166200995445251, "reward_std": 0.09467747807502747, "rewards/code_reward/mean": 0.41701066493988037, "rewards/code_reward/std": 0.38109537959098816, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1332.0, "completions/max_terminated_length": 1332.0, "completions/mean_length": 277.111328125, "completions/mean_terminated_length": 277.111328125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.6230975828111012, "frac_reward_zero_std": 0.34375, "grad_norm": 0.32044262883263075, "kl": 0.177001953125, "learning_rate": 1.5174407832310338e-06, "loss": 0.0169, "num_tokens": 159123112.0, "reward": 0.4757963716983795, "reward_std": 0.10352632403373718, "rewards/code_reward/mean": 0.37599167227745056, "rewards/code_reward/std": 0.3537483811378479, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 868.0, "completions/max_terminated_length": 868.0, "completions/mean_length": 278.60546875, "completions/mean_terminated_length": 278.60546875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.6248880931065354, "frac_reward_zero_std": 0.203125, "grad_norm": 0.3572471095507143, "kl": 0.1552734375, "learning_rate": 1.5052736874374815e-06, "loss": -0.0031, "num_tokens": 159579174.0, "reward": 0.5212893486022949, "reward_std": 0.12779279053211212, "rewards/code_reward/mean": 0.4224611818790436, "rewards/code_reward/std": 0.38893061876296997, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 269.44921875, "completions/mean_terminated_length": 269.44921875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.6266786034019696, "frac_reward_zero_std": 0.296875, "grad_norm": 0.33376281096755667, "kl": 0.169189453125, "learning_rate": 1.4931588178670695e-06, "loss": 0.0114, "num_tokens": 160015548.0, "reward": 0.5164725184440613, "reward_std": 0.07941782474517822, "rewards/code_reward/mean": 0.41705840826034546, "rewards/code_reward/std": 0.3816792666912079, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1045.0, "completions/max_terminated_length": 1045.0, "completions/mean_length": 278.591796875, "completions/mean_terminated_length": 278.591796875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.6284691136974038, "frac_reward_zero_std": 0.265625, "grad_norm": 0.32464114679509315, "kl": 0.154296875, "learning_rate": 1.4810966828354605e-06, "loss": 0.0235, "num_tokens": 160439723.0, "reward": 0.4422021806240082, "reward_std": 0.10568234324455261, "rewards/code_reward/mean": 0.34317871928215027, "rewards/code_reward/std": 0.3573067784309387, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1128.0, "completions/max_terminated_length": 1128.0, "completions/mean_length": 289.91015625, "completions/mean_terminated_length": 289.91015625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.630259623992838, "frac_reward_zero_std": 0.265625, "grad_norm": 0.30609987755476703, "kl": 0.1630859375, "learning_rate": 1.469087788445684e-06, "loss": 0.0007, "num_tokens": 160875037.0, "reward": 0.5177447199821472, "reward_std": 0.10866422206163406, "rewards/code_reward/mean": 0.41950252652168274, "rewards/code_reward/std": 0.37049955129623413, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1482.0, "completions/max_terminated_length": 1482.0, "completions/mean_length": 299.3359375, "completions/mean_terminated_length": 299.3359375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.6320501342882722, "frac_reward_zero_std": 0.375, "grad_norm": 0.28664854011044005, "kl": 0.15869140625, "learning_rate": 1.4571326385668965e-06, "loss": 0.0239, "num_tokens": 161346673.0, "reward": 0.5187546014785767, "reward_std": 0.10010634362697601, "rewards/code_reward/mean": 0.4191451966762543, "rewards/code_reward/std": 0.3820086419582367, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1496.0, "completions/max_terminated_length": 1496.0, "completions/mean_length": 289.9140625, "completions/mean_terminated_length": 289.9140625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.6338406445837064, "frac_reward_zero_std": 0.1875, "grad_norm": 0.3525662753357496, "kl": 0.164306640625, "learning_rate": 1.4452317348132434e-06, "loss": 0.0073, "num_tokens": 161800029.0, "reward": 0.4837416410446167, "reward_std": 0.12599599361419678, "rewards/code_reward/mean": 0.38530415296554565, "rewards/code_reward/std": 0.36869585514068604, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 744.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 286.498046875, "completions/mean_terminated_length": 286.498046875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.6356311548791406, "frac_reward_zero_std": 0.3125, "grad_norm": 0.3301283202260216, "kl": 0.18310546875, "learning_rate": 1.4333855765228104e-06, "loss": 0.0256, "num_tokens": 162277828.0, "reward": 0.4298362135887146, "reward_std": 0.11077988892793655, "rewards/code_reward/mean": 0.3302268385887146, "rewards/code_reward/std": 0.34389829635620117, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1370.0, "completions/max_terminated_length": 1370.0, "completions/mean_length": 291.48828125, "completions/mean_terminated_length": 291.48828125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.6374216651745748, "frac_reward_zero_std": 0.234375, "grad_norm": 0.3281363816237195, "kl": 0.1611328125, "learning_rate": 1.421594660736675e-06, "loss": 0.0271, "num_tokens": 162756158.0, "reward": 0.42322519421577454, "reward_std": 0.09830034524202347, "rewards/code_reward/mean": 0.3238111436367035, "rewards/code_reward/std": 0.352096825838089, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1500.0, "completions/max_terminated_length": 1500.0, "completions/mean_length": 300.306640625, "completions/mean_terminated_length": 300.306640625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.639212175470009, "frac_reward_zero_std": 0.34375, "grad_norm": 0.3246668864020803, "kl": 0.166748046875, "learning_rate": 1.4098594821780476e-06, "loss": 0.0233, "num_tokens": 163211635.0, "reward": 0.46630656719207764, "reward_std": 0.1004958525300026, "rewards/code_reward/mean": 0.3674784302711487, "rewards/code_reward/std": 0.36566826701164246, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 749.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 292.28125, "completions/mean_terminated_length": 292.28125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.6410026857654432, "frac_reward_zero_std": 0.203125, "grad_norm": 0.3335331399432968, "kl": 0.173583984375, "learning_rate": 1.3981805332315174e-06, "loss": -0.0006, "num_tokens": 163670411.0, "reward": 0.4827768802642822, "reward_std": 0.10180593281984329, "rewards/code_reward/mean": 0.3833628296852112, "rewards/code_reward/std": 0.34714001417160034, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 805.0, "completions/max_terminated_length": 805.0, "completions/mean_length": 302.34765625, "completions/mean_terminated_length": 302.34765625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.6427931960608774, "frac_reward_zero_std": 0.265625, "grad_norm": 0.30599064217606003, "kl": 0.156494140625, "learning_rate": 1.3865583039223929e-06, "loss": 0.016, "num_tokens": 164142285.0, "reward": 0.5053516030311584, "reward_std": 0.12150374054908752, "rewards/code_reward/mean": 0.4065234959125519, "rewards/code_reward/std": 0.33841168880462646, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1085.0, "completions/max_terminated_length": 1085.0, "completions/mean_length": 280.4453125, "completions/mean_terminated_length": 280.4453125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.6445837063563116, "frac_reward_zero_std": 0.375, "grad_norm": 0.29857103509396066, "kl": 0.160400390625, "learning_rate": 1.374993281896137e-06, "loss": 0.0154, "num_tokens": 164568953.0, "reward": 0.5624741315841675, "reward_std": 0.10268101096153259, "rewards/code_reward/mean": 0.4634506404399872, "rewards/code_reward/std": 0.3811890482902527, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 292.47265625, "completions/mean_terminated_length": 289.03717041015625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.6463742166517458, "frac_reward_zero_std": 0.25, "grad_norm": 0.31002420629805627, "kl": 0.153076171875, "learning_rate": 1.3634859523979134e-06, "loss": 0.0243, "num_tokens": 165015355.0, "reward": 0.5296179056167603, "reward_std": 0.12728886306285858, "rewards/code_reward/mean": 0.4311804175376892, "rewards/code_reward/std": 0.37302082777023315, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 305.298828125, "completions/mean_terminated_length": 305.298828125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.64816472694718, "frac_reward_zero_std": 0.234375, "grad_norm": 0.30120073745773723, "kl": 0.162109375, "learning_rate": 1.3520367982522208e-06, "loss": 0.0219, "num_tokens": 165488220.0, "reward": 0.4850338399410248, "reward_std": 0.1142628937959671, "rewards/code_reward/mean": 0.38659635186195374, "rewards/code_reward/std": 0.36400556564331055, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1338.0, "completions/max_terminated_length": 1338.0, "completions/mean_length": 285.5390625, "completions/mean_terminated_length": 285.5390625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.6499552372426142, "frac_reward_zero_std": 0.3125, "grad_norm": 0.3082898359850464, "kl": 0.159423828125, "learning_rate": 1.3406462998426358e-06, "loss": -0.0145, "num_tokens": 165947672.0, "reward": 0.5653737783432007, "reward_std": 0.1008816659450531, "rewards/code_reward/mean": 0.4655691385269165, "rewards/code_reward/std": 0.37211814522743225, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 284.376953125, "completions/mean_terminated_length": 280.9256286621094, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.6517457475380484, "frac_reward_zero_std": 0.21875, "grad_norm": 0.3436608653899491, "kl": 0.1591796875, "learning_rate": 1.3293149350916595e-06, "loss": 0.0224, "num_tokens": 166385057.0, "reward": 0.5994518399238586, "reward_std": 0.12821917235851288, "rewards/code_reward/mean": 0.500428318977356, "rewards/code_reward/std": 0.40046557784080505, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 737.0, "completions/max_terminated_length": 737.0, "completions/mean_length": 289.017578125, "completions/mean_terminated_length": 289.017578125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.6535362578334826, "frac_reward_zero_std": 0.203125, "grad_norm": 0.3542984268530134, "kl": 0.16357421875, "learning_rate": 1.3180431794406623e-06, "loss": 0.0117, "num_tokens": 166832474.0, "reward": 0.4515675902366638, "reward_std": 0.1027330756187439, "rewards/code_reward/mean": 0.35234886407852173, "rewards/code_reward/std": 0.36300602555274963, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1120.0, "completions/max_terminated_length": 1120.0, "completions/mean_length": 291.96484375, "completions/mean_terminated_length": 291.71429443359375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.6553267681289168, "frac_reward_zero_std": 0.1875, "grad_norm": 0.35660557639887736, "kl": 0.177978515625, "learning_rate": 1.3068315058299358e-06, "loss": 0.0265, "num_tokens": 167277696.0, "reward": 0.4684012830257416, "reward_std": 0.1260806769132614, "rewards/code_reward/mean": 0.36898723244667053, "rewards/code_reward/std": 0.32607021927833557, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 880.0, "completions/max_terminated_length": 880.0, "completions/mean_length": 285.0859375, "completions/mean_terminated_length": 285.0859375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.657117278424351, "frac_reward_zero_std": 0.125, "grad_norm": 0.41801675334317545, "kl": 0.193359375, "learning_rate": 1.2956803846788503e-06, "loss": 0.0148, "num_tokens": 167718660.0, "reward": 0.47322607040405273, "reward_std": 0.1394314020872116, "rewards/code_reward/mean": 0.3738119602203369, "rewards/code_reward/std": 0.34230369329452515, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 760.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 273.767578125, "completions/mean_terminated_length": 273.767578125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.6589077887197852, "frac_reward_zero_std": 0.25, "grad_norm": 0.3299423899506483, "kl": 0.1640625, "learning_rate": 1.284590283866116e-06, "loss": 0.0322, "num_tokens": 168154573.0, "reward": 0.5262330174446106, "reward_std": 0.13022397458553314, "rewards/code_reward/mean": 0.42779552936553955, "rewards/code_reward/std": 0.38629141449928284, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1042.0, "completions/max_terminated_length": 1042.0, "completions/mean_length": 286.798828125, "completions/mean_terminated_length": 286.798828125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.6606982990152194, "frac_reward_zero_std": 0.234375, "grad_norm": 0.33727579776750194, "kl": 0.15673828125, "learning_rate": 1.2735616687101518e-06, "loss": 0.0316, "num_tokens": 168603086.0, "reward": 0.501867413520813, "reward_std": 0.1156509518623352, "rewards/code_reward/mean": 0.4022579789161682, "rewards/code_reward/std": 0.37697312235832214, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1070.0, "completions/max_terminated_length": 1070.0, "completions/mean_length": 273.255859375, "completions/mean_terminated_length": 273.255859375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.6624888093106536, "frac_reward_zero_std": 0.234375, "grad_norm": 0.3539769527200901, "kl": 0.163818359375, "learning_rate": 1.2625950019495614e-06, "loss": 0.0144, "num_tokens": 169040905.0, "reward": 0.5669090747833252, "reward_std": 0.12569212913513184, "rewards/code_reward/mean": 0.4672996997833252, "rewards/code_reward/std": 0.3925403952598572, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1683.0, "completions/max_terminated_length": 1683.0, "completions/mean_length": 288.3046875, "completions/mean_terminated_length": 288.3046875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.6642793196060878, "frac_reward_zero_std": 0.21875, "grad_norm": 0.3330282648808702, "kl": 0.16748046875, "learning_rate": 1.251690743723718e-06, "loss": 0.0223, "num_tokens": 169489549.0, "reward": 0.5209923982620239, "reward_std": 0.09224514663219452, "rewards/code_reward/mean": 0.421968936920166, "rewards/code_reward/std": 0.3694319725036621, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 862.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 266.580078125, "completions/mean_terminated_length": 266.580078125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.666069829901522, "frac_reward_zero_std": 0.296875, "grad_norm": 0.318497678987219, "kl": 0.17333984375, "learning_rate": 1.2408493515534581e-06, "loss": 0.0003, "num_tokens": 169939062.0, "reward": 0.4828207194805145, "reward_std": 0.09432366490364075, "rewards/code_reward/mean": 0.38496914505958557, "rewards/code_reward/std": 0.373337060213089, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 282.287109375, "completions/mean_terminated_length": 282.287109375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.6678603401969562, "frac_reward_zero_std": 0.203125, "grad_norm": 0.33430857359061905, "kl": 0.165771484375, "learning_rate": 1.2300712803218834e-06, "loss": 0.0353, "num_tokens": 170387665.0, "reward": 0.4752040505409241, "reward_std": 0.10460580140352249, "rewards/code_reward/mean": 0.3763759136199951, "rewards/code_reward/std": 0.330888956785202, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 258.61328125, "completions/mean_terminated_length": 258.61328125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.6696508504923904, "frac_reward_zero_std": 0.265625, "grad_norm": 0.32686141498779164, "kl": 0.164306640625, "learning_rate": 1.2193569822552772e-06, "loss": -0.0069, "num_tokens": 170803387.0, "reward": 0.5560147762298584, "reward_std": 0.1028691902756691, "rewards/code_reward/mean": 0.4577726125717163, "rewards/code_reward/std": 0.3851953446865082, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 736.0, "completions/max_terminated_length": 736.0, "completions/mean_length": 264.630859375, "completions/mean_terminated_length": 264.630859375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.6714413607878246, "frac_reward_zero_std": 0.25, "grad_norm": 0.343446074927442, "kl": 0.1650390625, "learning_rate": 1.2087069069041268e-06, "loss": 0.0022, "num_tokens": 171250478.0, "reward": 0.486378014087677, "reward_std": 0.1135624498128891, "rewards/code_reward/mean": 0.38794052600860596, "rewards/code_reward/std": 0.3557998538017273, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 641.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 254.345703125, "completions/mean_terminated_length": 254.345703125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.6732318710832588, "frac_reward_zero_std": 0.296875, "grad_norm": 0.33737133523480145, "kl": 0.171142578125, "learning_rate": 1.1981215011242654e-06, "loss": 0.0288, "num_tokens": 171683423.0, "reward": 0.5268545150756836, "reward_std": 0.08505912125110626, "rewards/code_reward/mean": 0.42841699719429016, "rewards/code_reward/std": 0.3646799325942993, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 266.4609375, "completions/mean_terminated_length": 266.4609375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.675022381378693, "frac_reward_zero_std": 0.25, "grad_norm": 0.32205555799650626, "kl": 0.15625, "learning_rate": 1.1876012090581184e-06, "loss": 0.0151, "num_tokens": 172100531.0, "reward": 0.509934663772583, "reward_std": 0.11905954033136368, "rewards/code_reward/mean": 0.41208308935165405, "rewards/code_reward/std": 0.3554818034172058, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1296.0, "completions/max_terminated_length": 1296.0, "completions/mean_length": 276.37109375, "completions/mean_terminated_length": 276.37109375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.6768128916741272, "frac_reward_zero_std": 0.25, "grad_norm": 0.3551130532394289, "kl": 0.18896484375, "learning_rate": 1.177146472116071e-06, "loss": 0.0045, "num_tokens": 172553345.0, "reward": 0.47903525829315186, "reward_std": 0.11595383286476135, "rewards/code_reward/mean": 0.3805977404117584, "rewards/code_reward/std": 0.36225563287734985, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 833.0, "completions/max_terminated_length": 833.0, "completions/mean_length": 261.451171875, "completions/mean_terminated_length": 261.451171875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.6786034019695614, "frac_reward_zero_std": 0.21875, "grad_norm": 0.3448186811453637, "kl": 0.168701171875, "learning_rate": 1.1667577289579462e-06, "loss": 0.0227, "num_tokens": 172976384.0, "reward": 0.5395588278770447, "reward_std": 0.133894145488739, "rewards/code_reward/mean": 0.44170722365379333, "rewards/code_reward/std": 0.3928757905960083, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 897.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 255.544921875, "completions/mean_terminated_length": 255.544921875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.6803939122649956, "frac_reward_zero_std": 0.265625, "grad_norm": 0.34748719468007955, "kl": 0.185546875, "learning_rate": 1.1564354154746007e-06, "loss": 0.0058, "num_tokens": 173376639.0, "reward": 0.49575555324554443, "reward_std": 0.11018548905849457, "rewards/code_reward/mean": 0.3982946276664734, "rewards/code_reward/std": 0.38791990280151367, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 253.060546875, "completions/mean_terminated_length": 253.060546875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.6821844225604298, "frac_reward_zero_std": 0.296875, "grad_norm": 0.33983491252931075, "kl": 0.16552734375, "learning_rate": 1.146179964769635e-06, "loss": 0.014, "num_tokens": 173820966.0, "reward": 0.4616347849369049, "reward_std": 0.08114675432443619, "rewards/code_reward/mean": 0.36280667781829834, "rewards/code_reward/std": 0.3501991629600525, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 251.419921875, "completions/mean_terminated_length": 251.419921875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.683974932855864, "frac_reward_zero_std": 0.28125, "grad_norm": 0.3405452918109619, "kl": 0.165771484375, "learning_rate": 1.1359918071412195e-06, "loss": 0.0138, "num_tokens": 174276357.0, "reward": 0.5084632039070129, "reward_std": 0.14230848848819733, "rewards/code_reward/mean": 0.40924444794654846, "rewards/code_reward/std": 0.3795515298843384, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 269.041015625, "completions/mean_terminated_length": 269.041015625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.6857654431512982, "frac_reward_zero_std": 0.171875, "grad_norm": 0.3342590347123932, "kl": 0.15625, "learning_rate": 1.1258713700640456e-06, "loss": 0.0172, "num_tokens": 174714890.0, "reward": 0.49692559242248535, "reward_std": 0.1449463963508606, "rewards/code_reward/mean": 0.39790210127830505, "rewards/code_reward/std": 0.3455314636230469, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 958.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 273.49609375, "completions/mean_terminated_length": 273.49609375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.6875559534467324, "frac_reward_zero_std": 0.234375, "grad_norm": 0.32423688023424907, "kl": 0.16162109375, "learning_rate": 1.115819078171383e-06, "loss": 0.0159, "num_tokens": 175164472.0, "reward": 0.48606646060943604, "reward_std": 0.12042847275733948, "rewards/code_reward/mean": 0.38743361830711365, "rewards/code_reward/std": 0.3694130778312683, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1299.0, "completions/max_terminated_length": 1299.0, "completions/mean_length": 254.455078125, "completions/mean_terminated_length": 254.455078125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.6893464637421666, "frac_reward_zero_std": 0.296875, "grad_norm": 0.35993427029071995, "kl": 0.1611328125, "learning_rate": 1.1058353532372667e-06, "loss": 0.0106, "num_tokens": 175591137.0, "reward": 0.3949888348579407, "reward_std": 0.0829850435256958, "rewards/code_reward/mean": 0.2949887812137604, "rewards/code_reward/std": 0.31740206480026245, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 882.0, "completions/max_terminated_length": 882.0, "completions/mean_length": 266.0, "completions/mean_terminated_length": 266.0, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.6911369740376008, "frac_reward_zero_std": 0.28125, "grad_norm": 0.3316647493625365, "kl": 0.15771484375, "learning_rate": 1.0959206141587998e-06, "loss": 0.0208, "num_tokens": 176024697.0, "reward": 0.5806926488876343, "reward_std": 0.09607809782028198, "rewards/code_reward/mean": 0.48166918754577637, "rewards/code_reward/std": 0.4003455340862274, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1151.0, "completions/max_terminated_length": 1151.0, "completions/mean_length": 273.84375, "completions/mean_terminated_length": 273.84375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.6929274843330349, "frac_reward_zero_std": 0.34375, "grad_norm": 0.3167003130073196, "kl": 0.15576171875, "learning_rate": 1.0860752769385766e-06, "loss": 0.0105, "num_tokens": 176499785.0, "reward": 0.49886220693588257, "reward_std": 0.11777278780937195, "rewards/code_reward/mean": 0.3994481563568115, "rewards/code_reward/std": 0.38102516531944275, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 886.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 263.20703125, "completions/mean_terminated_length": 263.20703125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.6947179946284691, "frac_reward_zero_std": 0.234375, "grad_norm": 0.3480931930970332, "kl": 0.155029296875, "learning_rate": 1.0762997546672279e-06, "loss": -0.0144, "num_tokens": 176947619.0, "reward": 0.541716456413269, "reward_std": 0.1321731060743332, "rewards/code_reward/mean": 0.4417165219783783, "rewards/code_reward/std": 0.36072513461112976, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 272.146484375, "completions/mean_terminated_length": 271.65167236328125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.6965085049239033, "frac_reward_zero_std": 0.203125, "grad_norm": 0.381207173643862, "kl": 0.162841796875, "learning_rate": 1.0665944575060914e-06, "loss": -0.0029, "num_tokens": 177406926.0, "reward": 0.43620550632476807, "reward_std": 0.10151318460702896, "rewards/code_reward/mean": 0.3373773694038391, "rewards/code_reward/std": 0.3224928081035614, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 897.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 283.2421875, "completions/mean_terminated_length": 283.2421875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.6982990152193375, "frac_reward_zero_std": 0.234375, "grad_norm": 0.3367958262874525, "kl": 0.16845703125, "learning_rate": 1.056959792669997e-06, "loss": 0.0124, "num_tokens": 177857666.0, "reward": 0.4412127733230591, "reward_std": 0.08164721727371216, "rewards/code_reward/mean": 0.34218931198120117, "rewards/code_reward/std": 0.3393513560295105, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1370.0, "completions/max_terminated_length": 1370.0, "completions/mean_length": 276.91796875, "completions/mean_terminated_length": 276.91796875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.7000895255147717, "frac_reward_zero_std": 0.328125, "grad_norm": 0.3112082079014226, "kl": 0.150146484375, "learning_rate": 1.0473961644101856e-06, "loss": 0.0046, "num_tokens": 178271624.0, "reward": 0.4946806728839874, "reward_std": 0.13418129086494446, "rewards/code_reward/mean": 0.39546188712120056, "rewards/code_reward/std": 0.39124324917793274, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 803.0, "completions/max_terminated_length": 803.0, "completions/mean_length": 255.232421875, "completions/mean_terminated_length": 255.232421875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.7018800358102059, "frac_reward_zero_std": 0.328125, "grad_norm": 0.33613066557412713, "kl": 0.183349609375, "learning_rate": 1.037903973997345e-06, "loss": 0.0053, "num_tokens": 178721351.0, "reward": 0.5151516199111938, "reward_std": 0.10684648156166077, "rewards/code_reward/mean": 0.41554221510887146, "rewards/code_reward/std": 0.37515121698379517, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1026.0, "completions/max_terminated_length": 1026.0, "completions/mean_length": 267.888671875, "completions/mean_terminated_length": 267.888671875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.7036705461056401, "frac_reward_zero_std": 0.265625, "grad_norm": 0.3662624197805354, "kl": 0.185546875, "learning_rate": 1.0284836197047737e-06, "loss": 0.0075, "num_tokens": 179156278.0, "reward": 0.5455970764160156, "reward_std": 0.11772000789642334, "rewards/code_reward/mean": 0.44637832045555115, "rewards/code_reward/std": 0.382731556892395, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1324.0, "completions/max_terminated_length": 1324.0, "completions/mean_length": 269.697265625, "completions/mean_terminated_length": 269.697265625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.7054610564010743, "frac_reward_zero_std": 0.234375, "grad_norm": 0.3283893621491066, "kl": 0.16552734375, "learning_rate": 1.0191354967916712e-06, "loss": 0.012, "num_tokens": 179593851.0, "reward": 0.48321235179901123, "reward_std": 0.12693935632705688, "rewards/code_reward/mean": 0.3843841850757599, "rewards/code_reward/std": 0.38849782943725586, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 697.0, "completions/max_terminated_length": 697.0, "completions/mean_length": 277.17578125, "completions/mean_terminated_length": 277.17578125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.7072515666965085, "frac_reward_zero_std": 0.203125, "grad_norm": 0.3373899300089672, "kl": 0.156982421875, "learning_rate": 1.0098599974865515e-06, "loss": -0.0079, "num_tokens": 180025069.0, "reward": 0.506369948387146, "reward_std": 0.12756794691085815, "rewards/code_reward/mean": 0.4071512222290039, "rewards/code_reward/std": 0.3677828907966614, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1142.0, "completions/mean_length": 301.90625, "completions/mean_terminated_length": 291.61492919921875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.7090420769919427, "frac_reward_zero_std": 0.1875, "grad_norm": 0.3173034818642803, "kl": 0.16162109375, "learning_rate": 1.0006575109707898e-06, "loss": 0.0182, "num_tokens": 180540493.0, "reward": 0.4626311957836151, "reward_std": 0.13469013571739197, "rewards/code_reward/mean": 0.3649749159812927, "rewards/code_reward/std": 0.355493426322937, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1044.0, "completions/max_terminated_length": 1044.0, "completions/mean_length": 271.904296875, "completions/mean_terminated_length": 271.904296875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.7108325872873769, "frac_reward_zero_std": 0.140625, "grad_norm": 0.3568191176877626, "kl": 0.160400390625, "learning_rate": 9.915284233622877e-07, "loss": 0.0189, "num_tokens": 180979804.0, "reward": 0.4569905996322632, "reward_std": 0.11931423842906952, "rewards/code_reward/mean": 0.3583577573299408, "rewards/code_reward/std": 0.33016687631607056, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1349.0, "completions/max_terminated_length": 1349.0, "completions/mean_length": 292.533203125, "completions/mean_terminated_length": 292.533203125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.7126230975828111, "frac_reward_zero_std": 0.140625, "grad_norm": 0.3366655623799135, "kl": 0.154541015625, "learning_rate": 9.824731176992796e-07, "loss": 0.0001, "num_tokens": 181476069.0, "reward": 0.5504260659217834, "reward_std": 0.14415885508060455, "rewards/code_reward/mean": 0.4515979290008545, "rewards/code_reward/std": 0.3584127128124237, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 292.3125, "completions/mean_terminated_length": 288.876708984375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.7144136078782453, "frac_reward_zero_std": 0.28125, "grad_norm": 0.3115455588658028, "kl": 0.16162109375, "learning_rate": 9.734919739242543e-07, "loss": 0.0274, "num_tokens": 181942869.0, "reward": 0.5985906720161438, "reward_std": 0.11552190780639648, "rewards/code_reward/mean": 0.5001531839370728, "rewards/code_reward/std": 0.3852982223033905, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1511.0, "completions/max_terminated_length": 1511.0, "completions/mean_length": 304.51953125, "completions/mean_terminated_length": 303.60076904296875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.7162041181736795, "frac_reward_zero_std": 0.265625, "grad_norm": 0.6827052992020021, "kl": 0.5830078125, "learning_rate": 9.645853688680177e-07, "loss": 0.0354, "num_tokens": 182403039.0, "reward": 0.5053490400314331, "reward_std": 0.12097816914319992, "rewards/code_reward/mean": 0.4063255488872528, "rewards/code_reward/std": 0.37412333488464355, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1152.0, "completions/max_terminated_length": 1152.0, "completions/mean_length": 309.970703125, "completions/mean_terminated_length": 309.970703125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.7179946284691137, "frac_reward_zero_std": 0.21875, "grad_norm": 0.3295330761961959, "kl": 0.150634765625, "learning_rate": 9.557536762338786e-07, "loss": 0.0168, "num_tokens": 182890856.0, "reward": 0.5114704370498657, "reward_std": 0.12499460577964783, "rewards/code_reward/mean": 0.4124469757080078, "rewards/code_reward/std": 0.3553789258003235, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1746.0, "completions/max_terminated_length": 1746.0, "completions/mean_length": 306.08203125, "completions/mean_terminated_length": 306.08203125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.7197851387645479, "frac_reward_zero_std": 0.25, "grad_norm": 0.3206404327908809, "kl": 0.143798828125, "learning_rate": 9.46997266581973e-07, "loss": 0.0303, "num_tokens": 183345562.0, "reward": 0.4812077283859253, "reward_std": 0.09828542917966843, "rewards/code_reward/mean": 0.3814030587673187, "rewards/code_reward/std": 0.359371155500412, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 774.0, "completions/max_terminated_length": 774.0, "completions/mean_length": 296.34375, "completions/mean_terminated_length": 296.34375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.7215756490599821, "frac_reward_zero_std": 0.328125, "grad_norm": 0.3136101216971687, "kl": 0.161865234375, "learning_rate": 9.383165073137115e-07, "loss": 0.0015, "num_tokens": 183788610.0, "reward": 0.43663638830184937, "reward_std": 0.07853668183088303, "rewards/code_reward/mean": 0.33800360560417175, "rewards/code_reward/std": 0.35945454239845276, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 848.0, "completions/max_terminated_length": 848.0, "completions/mean_length": 288.88671875, "completions/mean_terminated_length": 288.88671875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.7233661593554163, "frac_reward_zero_std": 0.28125, "grad_norm": 0.31536521678592044, "kl": 0.156982421875, "learning_rate": 9.297117626563687e-07, "loss": 0.0258, "num_tokens": 184251528.0, "reward": 0.5221993923187256, "reward_std": 0.12001742422580719, "rewards/code_reward/mean": 0.4229806661605835, "rewards/code_reward/std": 0.3674774169921875, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1372.0, "completions/max_terminated_length": 1372.0, "completions/mean_length": 302.96484375, "completions/mean_terminated_length": 302.96484375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.7251566696508505, "frac_reward_zero_std": 0.25, "grad_norm": 0.32866345608034325, "kl": 0.15771484375, "learning_rate": 9.211833936477957e-07, "loss": 0.0233, "num_tokens": 184729686.0, "reward": 0.526389479637146, "reward_std": 0.1271475851535797, "rewards/code_reward/mean": 0.42717069387435913, "rewards/code_reward/std": 0.38876694440841675, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1410.0, "completions/max_terminated_length": 1410.0, "completions/mean_length": 309.904296875, "completions/mean_terminated_length": 309.904296875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.7269471799462847, "frac_reward_zero_std": 0.25, "grad_norm": 0.32971934256691277, "kl": 0.146728515625, "learning_rate": 9.127317581212753e-07, "loss": 0.0176, "num_tokens": 185230845.0, "reward": 0.4583035111427307, "reward_std": 0.11482838541269302, "rewards/code_reward/mean": 0.3592800796031952, "rewards/code_reward/std": 0.3700915277004242, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 304.619140625, "completions/mean_terminated_length": 301.2074279785156, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.7287376902417189, "frac_reward_zero_std": 0.25, "grad_norm": 0.30213866747233215, "kl": 0.151611328125, "learning_rate": 9.043572106905084e-07, "loss": 0.0292, "num_tokens": 185701666.0, "reward": 0.41433510184288025, "reward_std": 0.11389996111392975, "rewards/code_reward/mean": 0.3149210214614868, "rewards/code_reward/std": 0.3246167004108429, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 289.515625, "completions/mean_terminated_length": 286.0743713378906, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.7305282005371531, "frac_reward_zero_std": 0.1875, "grad_norm": 0.32267979786095274, "kl": 0.158935546875, "learning_rate": 8.960601027347321e-07, "loss": 0.027, "num_tokens": 186151322.0, "reward": 0.49915528297424316, "reward_std": 0.14736242592334747, "rewards/code_reward/mean": 0.39974117279052734, "rewards/code_reward/std": 0.3728961646556854, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1358.0, "completions/max_terminated_length": 1358.0, "completions/mean_length": 298.0234375, "completions/mean_terminated_length": 298.0234375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.7323187108325873, "frac_reward_zero_std": 0.296875, "grad_norm": 0.3267380231255312, "kl": 0.177490234375, "learning_rate": 8.878407823839788e-07, "loss": 0.0374, "num_tokens": 186613462.0, "reward": 0.5384736061096191, "reward_std": 0.1009739339351654, "rewards/code_reward/mean": 0.4396454989910126, "rewards/code_reward/std": 0.38676124811172485, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 890.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 296.9609375, "completions/mean_terminated_length": 296.381591796875, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.7341092211280215, "frac_reward_zero_std": 0.203125, "grad_norm": 0.4612241389681678, "kl": 0.203857421875, "learning_rate": 8.796995945044689e-07, "loss": 0.016, "num_tokens": 187057186.0, "reward": 0.45203346014022827, "reward_std": 0.1315121054649353, "rewards/code_reward/mean": 0.3532053232192993, "rewards/code_reward/std": 0.3531695604324341, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 855.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 298.4609375, "completions/mean_terminated_length": 298.4609375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.7358997314234557, "frac_reward_zero_std": 0.265625, "grad_norm": 0.31229359165911336, "kl": 0.15234375, "learning_rate": 8.716368806841405e-07, "loss": 0.0223, "num_tokens": 187511990.0, "reward": 0.553558886051178, "reward_std": 0.12895874679088593, "rewards/code_reward/mean": 0.4543401300907135, "rewards/code_reward/std": 0.37593576312065125, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1842.0, "completions/max_terminated_length": 1842.0, "completions/mean_length": 324.986328125, "completions/mean_terminated_length": 324.986328125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.7376902417188899, "frac_reward_zero_std": 0.25, "grad_norm": 0.30206839363450727, "kl": 0.158447265625, "learning_rate": 8.636529792183171e-07, "loss": 0.0267, "num_tokens": 187988367.0, "reward": 0.5640175342559814, "reward_std": 0.1312108039855957, "rewards/code_reward/mean": 0.4647987484931946, "rewards/code_reward/std": 0.3985426425933838, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 887.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 281.29296875, "completions/mean_terminated_length": 281.29296875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.7394807520143241, "frac_reward_zero_std": 0.21875, "grad_norm": 0.3493104399791931, "kl": 0.16259765625, "learning_rate": 8.557482250955144e-07, "loss": 0.0188, "num_tokens": 188453237.0, "reward": 0.5384122729301453, "reward_std": 0.1037464365363121, "rewards/code_reward/mean": 0.4389982223510742, "rewards/code_reward/std": 0.38801929354667664, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 753.0, "completions/max_terminated_length": 753.0, "completions/mean_length": 273.94921875, "completions/mean_terminated_length": 273.94921875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.7412712623097583, "frac_reward_zero_std": 0.3125, "grad_norm": 0.313275543322612, "kl": 0.15576171875, "learning_rate": 8.479229499833844e-07, "loss": 0.0015, "num_tokens": 188894859.0, "reward": 0.5159948468208313, "reward_std": 0.10658752918243408, "rewards/code_reward/mean": 0.41716670989990234, "rewards/code_reward/std": 0.3775886297225952, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 282.865234375, "completions/mean_terminated_length": 282.865234375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.7430617726051925, "frac_reward_zero_std": 0.21875, "grad_norm": 0.3457903466739407, "kl": 0.154052734375, "learning_rate": 8.401774822147976e-07, "loss": 0.0091, "num_tokens": 189364142.0, "reward": 0.46550217270851135, "reward_std": 0.1155625581741333, "rewards/code_reward/mean": 0.3662834167480469, "rewards/code_reward/std": 0.3531208336353302, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1048.0, "completions/max_terminated_length": 1048.0, "completions/mean_length": 298.642578125, "completions/mean_terminated_length": 298.642578125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.7448522829006267, "frac_reward_zero_std": 0.296875, "grad_norm": 0.3205092080963518, "kl": 0.157958984375, "learning_rate": 8.325121467740695e-07, "loss": 0.0265, "num_tokens": 189831615.0, "reward": 0.5242391228675842, "reward_std": 0.12781904637813568, "rewards/code_reward/mean": 0.42462974786758423, "rewards/code_reward/std": 0.3863951563835144, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1142.0, "completions/max_terminated_length": 1142.0, "completions/mean_length": 304.1640625, "completions/mean_terminated_length": 304.1640625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.7466427931960609, "frac_reward_zero_std": 0.234375, "grad_norm": 0.3178454452731441, "kl": 0.16552734375, "learning_rate": 8.249272652833226e-07, "loss": 0.0238, "num_tokens": 190296163.0, "reward": 0.5295183658599854, "reward_std": 0.15451239049434662, "rewards/code_reward/mean": 0.43029963970184326, "rewards/code_reward/std": 0.3693654537200928, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 664.0, "completions/max_terminated_length": 664.0, "completions/mean_length": 285.357421875, "completions/mean_terminated_length": 285.357421875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.7484333034914951, "frac_reward_zero_std": 0.296875, "grad_norm": 0.31329639121370967, "kl": 0.150634765625, "learning_rate": 8.174231559889931e-07, "loss": 0.003, "num_tokens": 190752698.0, "reward": 0.5299686193466187, "reward_std": 0.10592755675315857, "rewards/code_reward/mean": 0.43035924434661865, "rewards/code_reward/std": 0.39202961325645447, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 776.0, "completions/max_terminated_length": 776.0, "completions/mean_length": 286.634765625, "completions/mean_terminated_length": 286.634765625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.7502238137869293, "frac_reward_zero_std": 0.234375, "grad_norm": 0.3508000098261981, "kl": 0.156494140625, "learning_rate": 8.100001337484787e-07, "loss": 0.0275, "num_tokens": 191237095.0, "reward": 0.497197687625885, "reward_std": 0.10232062637805939, "rewards/code_reward/mean": 0.39778363704681396, "rewards/code_reward/std": 0.36757269501686096, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 786.0, "completions/mean_length": 290.484375, "completions/mean_terminated_length": 287.0450134277344, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.7520143240823635, "frac_reward_zero_std": 0.203125, "grad_norm": 0.3448468148215436, "kl": 0.15283203125, "learning_rate": 8.026585100169251e-07, "loss": 0.0345, "num_tokens": 191703487.0, "reward": 0.48169082403182983, "reward_std": 0.11510834842920303, "rewards/code_reward/mean": 0.38247209787368774, "rewards/code_reward/std": 0.334516704082489, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 961.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 285.552734375, "completions/mean_terminated_length": 285.552734375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.7538048343777977, "frac_reward_zero_std": 0.265625, "grad_norm": 0.33253084970296726, "kl": 0.156982421875, "learning_rate": 7.953985928341601e-07, "loss": 0.0202, "num_tokens": 192123362.0, "reward": 0.45922157168388367, "reward_std": 0.11238031089305878, "rewards/code_reward/mean": 0.35980749130249023, "rewards/code_reward/std": 0.35185176134109497, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1583.0, "completions/max_terminated_length": 1583.0, "completions/mean_length": 288.994140625, "completions/mean_terminated_length": 288.994140625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.7555953446732319, "frac_reward_zero_std": 0.34375, "grad_norm": 0.3016528849947574, "kl": 0.15771484375, "learning_rate": 7.882206868117693e-07, "loss": 0.0036, "num_tokens": 192589399.0, "reward": 0.5083057880401611, "reward_std": 0.10758490115404129, "rewards/code_reward/mean": 0.409282386302948, "rewards/code_reward/std": 0.3773415982723236, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 994.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 288.349609375, "completions/mean_terminated_length": 288.349609375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.7573858549686661, "frac_reward_zero_std": 0.25, "grad_norm": 0.3381061742296827, "kl": 0.161865234375, "learning_rate": 7.81125093120313e-07, "loss": 0.0279, "num_tokens": 193024170.0, "reward": 0.5243711471557617, "reward_std": 0.12894684076309204, "rewards/code_reward/mean": 0.42573830485343933, "rewards/code_reward/std": 0.4012729823589325, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 872.0, "completions/max_terminated_length": 872.0, "completions/mean_length": 286.166015625, "completions/mean_terminated_length": 286.166015625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.7591763652641003, "frac_reward_zero_std": 0.3125, "grad_norm": 0.3215579468766339, "kl": 0.181396484375, "learning_rate": 7.741121094766916e-07, "loss": 0.0109, "num_tokens": 193514543.0, "reward": 0.4932405352592468, "reward_std": 0.08172842860221863, "rewards/code_reward/mean": 0.39402174949645996, "rewards/code_reward/std": 0.3680379390716553, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 841.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 291.46875, "completions/mean_terminated_length": 291.46875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.7609668755595345, "frac_reward_zero_std": 0.265625, "grad_norm": 0.3112954674482282, "kl": 0.15185546875, "learning_rate": 7.671820301316532e-07, "loss": 0.0061, "num_tokens": 193986567.0, "reward": 0.5019032955169678, "reward_std": 0.11825037002563477, "rewards/code_reward/mean": 0.40248924493789673, "rewards/code_reward/std": 0.3791036903858185, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 291.30859375, "completions/mean_terminated_length": 291.30859375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.7627573858549687, "frac_reward_zero_std": 0.28125, "grad_norm": 0.3231080753851668, "kl": 0.156005859375, "learning_rate": 7.603351458574474e-07, "loss": 0.0105, "num_tokens": 194425413.0, "reward": 0.49471795558929443, "reward_std": 0.11553268134593964, "rewards/code_reward/mean": 0.39608514308929443, "rewards/code_reward/std": 0.3794829845428467, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1048.0, "completions/max_terminated_length": 1048.0, "completions/mean_length": 284.47265625, "completions/mean_terminated_length": 284.47265625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.7645478961504029, "frac_reward_zero_std": 0.296875, "grad_norm": 0.33751785804827233, "kl": 0.154052734375, "learning_rate": 7.535717439356255e-07, "loss": 0.0186, "num_tokens": 194863167.0, "reward": 0.49711838364601135, "reward_std": 0.10482794046401978, "rewards/code_reward/mean": 0.39809495210647583, "rewards/code_reward/std": 0.3436027467250824, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 756.0, "completions/max_terminated_length": 756.0, "completions/mean_length": 275.8203125, "completions/mean_terminated_length": 275.8203125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.7663384064458371, "frac_reward_zero_std": 0.296875, "grad_norm": 0.3443225300836166, "kl": 0.17041015625, "learning_rate": 7.46892108144986e-07, "loss": 0.0248, "num_tokens": 195303739.0, "reward": 0.541577935218811, "reward_std": 0.11373305320739746, "rewards/code_reward/mean": 0.4427497684955597, "rewards/code_reward/std": 0.3838665187358856, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 288.966796875, "completions/mean_terminated_length": 284.47845458984375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.7681289167412713, "frac_reward_zero_std": 0.3125, "grad_norm": 1.891980392895429, "kl": 0.169921875, "learning_rate": 7.402965187496697e-07, "loss": 0.0379, "num_tokens": 195762842.0, "reward": 0.49835777282714844, "reward_std": 0.1028522476553917, "rewards/code_reward/mean": 0.3991389870643616, "rewards/code_reward/std": 0.39109355211257935, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 786.0, "completions/max_terminated_length": 786.0, "completions/mean_length": 294.529296875, "completions/mean_terminated_length": 294.529296875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.7699194270367055, "frac_reward_zero_std": 0.203125, "grad_norm": 0.34157120245619155, "kl": 0.157470703125, "learning_rate": 7.337852524873974e-07, "loss": 0.0241, "num_tokens": 196258073.0, "reward": 0.4455525577068329, "reward_std": 0.08384969830513, "rewards/code_reward/mean": 0.3467244505882263, "rewards/code_reward/std": 0.3544946312904358, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1097.0, "completions/max_terminated_length": 1097.0, "completions/mean_length": 280.0703125, "completions/mean_terminated_length": 280.0703125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.7717099373321397, "frac_reward_zero_std": 0.25, "grad_norm": 0.33246329194909474, "kl": 0.1669921875, "learning_rate": 7.273585825578608e-07, "loss": 0.0161, "num_tokens": 196698205.0, "reward": 0.48979827761650085, "reward_std": 0.10945957899093628, "rewards/code_reward/mean": 0.3915560841560364, "rewards/code_reward/std": 0.3678015172481537, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 953.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 281.5859375, "completions/mean_terminated_length": 281.5859375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.7735004476275739, "frac_reward_zero_std": 0.21875, "grad_norm": 0.3425214289438743, "kl": 0.156005859375, "learning_rate": 7.21016778611259e-07, "loss": -0.0224, "num_tokens": 197130905.0, "reward": 0.5075055360794067, "reward_std": 0.10311309248209, "rewards/code_reward/mean": 0.40848207473754883, "rewards/code_reward/std": 0.3589523136615753, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1225.0, "completions/max_terminated_length": 1225.0, "completions/mean_length": 287.34375, "completions/mean_terminated_length": 287.34375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.7752909579230081, "frac_reward_zero_std": 0.25, "grad_norm": 0.332605774043555, "kl": 0.1650390625, "learning_rate": 7.147601067369835e-07, "loss": 0.0301, "num_tokens": 197602097.0, "reward": 0.4721888303756714, "reward_std": 0.09201394021511078, "rewards/code_reward/mean": 0.37277474999427795, "rewards/code_reward/std": 0.36664527654647827, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1667.0, "completions/mean_length": 297.84375, "completions/mean_terminated_length": 294.4187927246094, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.7770814682184423, "frac_reward_zero_std": 0.265625, "grad_norm": 0.3175833887850813, "kl": 0.150634765625, "learning_rate": 7.085888294524561e-07, "loss": 0.052, "num_tokens": 198060697.0, "reward": 0.5086120963096619, "reward_std": 0.09990690648555756, "rewards/code_reward/mean": 0.40997928380966187, "rewards/code_reward/std": 0.35522231459617615, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 951.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 277.677734375, "completions/mean_terminated_length": 277.677734375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.7788719785138765, "frac_reward_zero_std": 0.296875, "grad_norm": 0.3316259964912605, "kl": 0.15576171875, "learning_rate": 7.025032056921117e-07, "loss": 0.0086, "num_tokens": 198522164.0, "reward": 0.5326995849609375, "reward_std": 0.11462061107158661, "rewards/code_reward/mean": 0.43328556418418884, "rewards/code_reward/std": 0.3760284185409546, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 793.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 287.361328125, "completions/mean_terminated_length": 287.361328125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.7806624888093107, "frac_reward_zero_std": 0.28125, "grad_norm": 0.31578232357642205, "kl": 0.152587890625, "learning_rate": 6.965034907965349e-07, "loss": 0.0244, "num_tokens": 198977037.0, "reward": 0.5374908447265625, "reward_std": 0.10307030379772186, "rewards/code_reward/mean": 0.4378814697265625, "rewards/code_reward/std": 0.38291504979133606, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1074.0, "completions/max_terminated_length": 1074.0, "completions/mean_length": 277.34375, "completions/mean_terminated_length": 277.34375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.7824529991047449, "frac_reward_zero_std": 0.3125, "grad_norm": 0.31972696641248005, "kl": 0.155517578125, "learning_rate": 6.905899365017462e-07, "loss": -0.0107, "num_tokens": 199426525.0, "reward": 0.5066140294075012, "reward_std": 0.0763346254825592, "rewards/code_reward/mean": 0.40739524364471436, "rewards/code_reward/std": 0.36438363790512085, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 768.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 288.580078125, "completions/mean_terminated_length": 288.580078125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.7842435094001791, "frac_reward_zero_std": 0.234375, "grad_norm": 0.3144189121632669, "kl": 0.148193359375, "learning_rate": 6.847627909286409e-07, "loss": -0.0036, "num_tokens": 199923294.0, "reward": 0.45676374435424805, "reward_std": 0.12470542639493942, "rewards/code_reward/mean": 0.357349693775177, "rewards/code_reward/std": 0.35051921010017395, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 854.0, "completions/max_terminated_length": 854.0, "completions/mean_length": 274.916015625, "completions/mean_terminated_length": 274.916015625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.7860340196956133, "frac_reward_zero_std": 0.296875, "grad_norm": 0.3374887726532489, "kl": 0.16455078125, "learning_rate": 6.790222985725761e-07, "loss": -0.0012, "num_tokens": 200385427.0, "reward": 0.468255877494812, "reward_std": 0.07880926877260208, "rewards/code_reward/mean": 0.3688417673110962, "rewards/code_reward/std": 0.3439018428325653, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 905.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 280.736328125, "completions/mean_terminated_length": 280.736328125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.7878245299910475, "frac_reward_zero_std": 0.21875, "grad_norm": 0.3318126500977582, "kl": 0.15673828125, "learning_rate": 6.733687002931141e-07, "loss": 0.0022, "num_tokens": 200818892.0, "reward": 0.47940531373023987, "reward_std": 0.10509155690670013, "rewards/code_reward/mean": 0.3805771768093109, "rewards/code_reward/std": 0.33890971541404724, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1907.0, "completions/max_terminated_length": 1907.0, "completions/mean_length": 292.8671875, "completions/mean_terminated_length": 292.8671875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.7896150402864817, "frac_reward_zero_std": 0.375, "grad_norm": 0.3100086962685305, "kl": 0.15234375, "learning_rate": 6.678022333039158e-07, "loss": 0.0315, "num_tokens": 201287200.0, "reward": 0.4889255166053772, "reward_std": 0.06793921440839767, "rewards/code_reward/mean": 0.3899020552635193, "rewards/code_reward/std": 0.36815646290779114, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 800.0, "completions/max_terminated_length": 800.0, "completions/mean_length": 286.3828125, "completions/mean_terminated_length": 286.3828125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.7914055505819159, "frac_reward_zero_std": 0.34375, "grad_norm": 0.2998225831850496, "kl": 0.15380859375, "learning_rate": 6.623231311627876e-07, "loss": 0.0044, "num_tokens": 201725220.0, "reward": 0.5353275537490845, "reward_std": 0.09797606617212296, "rewards/code_reward/mean": 0.43591347336769104, "rewards/code_reward/std": 0.4137617349624634, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 783.0, "completions/max_terminated_length": 783.0, "completions/mean_length": 298.380859375, "completions/mean_terminated_length": 298.380859375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.7931960608773501, "frac_reward_zero_std": 0.265625, "grad_norm": 0.3156056583271159, "kl": 0.156494140625, "learning_rate": 6.569316237618811e-07, "loss": 0.033, "num_tokens": 202176463.0, "reward": 0.4418387711048126, "reward_std": 0.12273675948381424, "rewards/code_reward/mean": 0.34222936630249023, "rewards/code_reward/std": 0.3645772933959961, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 282.908203125, "completions/mean_terminated_length": 282.908203125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.7949865711727843, "frac_reward_zero_std": 0.203125, "grad_norm": 0.3363395031862533, "kl": 0.14990234375, "learning_rate": 6.516279373180499e-07, "loss": -0.0059, "num_tokens": 202599888.0, "reward": 0.4658171832561493, "reward_std": 0.12160781770944595, "rewards/code_reward/mean": 0.3665984272956848, "rewards/code_reward/std": 0.36045578122138977, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 290.23046875, "completions/mean_terminated_length": 290.23046875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.7967770814682185, "frac_reward_zero_std": 0.25, "grad_norm": 0.32497169446283136, "kl": 0.15625, "learning_rate": 6.464122943633543e-07, "loss": 0.0202, "num_tokens": 203058262.0, "reward": 0.516558051109314, "reward_std": 0.13307727873325348, "rewards/code_reward/mean": 0.4171440005302429, "rewards/code_reward/std": 0.36391547322273254, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 771.0, "completions/max_terminated_length": 771.0, "completions/mean_length": 286.955078125, "completions/mean_terminated_length": 286.955078125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.7985675917636527, "frac_reward_zero_std": 0.296875, "grad_norm": 0.32594933937368764, "kl": 0.151611328125, "learning_rate": 6.412849137357271e-07, "loss": 0.0083, "num_tokens": 203514095.0, "reward": 0.4698641002178192, "reward_std": 0.09007802605628967, "rewards/code_reward/mean": 0.3704500198364258, "rewards/code_reward/std": 0.37283194065093994, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 291.33203125, "completions/mean_terminated_length": 287.8943176269531, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.8003581020590869, "frac_reward_zero_std": 0.328125, "grad_norm": 0.295787071130766, "kl": 0.14453125, "learning_rate": 6.3624601056979e-07, "loss": -0.0011, "num_tokens": 203950833.0, "reward": 0.5455830097198486, "reward_std": 0.09754446148872375, "rewards/code_reward/mean": 0.44695019721984863, "rewards/code_reward/std": 0.4033321142196655, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1078.0, "completions/max_terminated_length": 1078.0, "completions/mean_length": 284.51171875, "completions/mean_terminated_length": 283.9608459472656, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.8021486123545211, "frac_reward_zero_std": 0.359375, "grad_norm": 0.36513191478678514, "kl": 0.28369140625, "learning_rate": 6.312957962878278e-07, "loss": 0.0228, "num_tokens": 204428407.0, "reward": 0.5308598279953003, "reward_std": 0.09614521265029907, "rewards/code_reward/mean": 0.43183642625808716, "rewards/code_reward/std": 0.3622574806213379, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 669.0, "completions/max_terminated_length": 669.0, "completions/mean_length": 276.935546875, "completions/mean_terminated_length": 276.935546875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.8039391226499553, "frac_reward_zero_std": 0.15625, "grad_norm": 0.37948353344543273, "kl": 0.161376953125, "learning_rate": 6.264344785909181e-07, "loss": 0.0119, "num_tokens": 204890206.0, "reward": 0.4910258650779724, "reward_std": 0.12765184044837952, "rewards/code_reward/mean": 0.3916117548942566, "rewards/code_reward/std": 0.3686625361442566, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1214.0, "completions/max_terminated_length": 1214.0, "completions/mean_length": 291.1640625, "completions/mean_terminated_length": 291.1640625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.8057296329453895, "frac_reward_zero_std": 0.3125, "grad_norm": 0.3269502589284492, "kl": 0.163818359375, "learning_rate": 6.216622614502149e-07, "loss": 0.0199, "num_tokens": 205355290.0, "reward": 0.5571842193603516, "reward_std": 0.11522470414638519, "rewards/code_reward/mean": 0.45777010917663574, "rewards/code_reward/std": 0.3654005825519562, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 296.451171875, "completions/mean_terminated_length": 295.618408203125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.8075201432408237, "frac_reward_zero_std": 0.328125, "grad_norm": 7.669786793840939, "kl": 4.96630859375, "learning_rate": 6.169793450983916e-07, "loss": 0.0618, "num_tokens": 205839593.0, "reward": 0.43700623512268066, "reward_std": 0.0928126648068428, "rewards/code_reward/mean": 0.33837342262268066, "rewards/code_reward/std": 0.3464515805244446, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 279.95703125, "completions/mean_terminated_length": 279.95703125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.8093106535362579, "frac_reward_zero_std": 0.296875, "grad_norm": 0.3354435916879996, "kl": 0.14892578125, "learning_rate": 6.123859260212393e-07, "loss": 0.0115, "num_tokens": 206285763.0, "reward": 0.5241540670394897, "reward_std": 0.09595154225826263, "rewards/code_reward/mean": 0.4251306354999542, "rewards/code_reward/std": 0.3574601411819458, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 305.0703125, "completions/mean_terminated_length": 305.0703125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.8111011638316921, "frac_reward_zero_std": 0.421875, "grad_norm": 0.27269241665843946, "kl": 0.144775390625, "learning_rate": 6.07882196949423e-07, "loss": 0.0342, "num_tokens": 206759655.0, "reward": 0.5184692144393921, "reward_std": 0.08284525573253632, "rewards/code_reward/mean": 0.41905510425567627, "rewards/code_reward/std": 0.4001327157020569, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1342.0, "completions/max_terminated_length": 1342.0, "completions/mean_length": 294.693359375, "completions/mean_terminated_length": 294.693359375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.8128916741271263, "frac_reward_zero_std": 0.21875, "grad_norm": 0.3551875444295618, "kl": 0.1533203125, "learning_rate": 6.034683468503948e-07, "loss": 0.0333, "num_tokens": 207204426.0, "reward": 0.4810914099216461, "reward_std": 0.12191994488239288, "rewards/code_reward/mean": 0.38187265396118164, "rewards/code_reward/std": 0.36706313490867615, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1514.0, "completions/max_terminated_length": 1514.0, "completions/mean_length": 310.40625, "completions/mean_terminated_length": 310.40625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.8146821844225605, "frac_reward_zero_std": 0.265625, "grad_norm": 0.33137707723782145, "kl": 0.1494140625, "learning_rate": 5.991445609204641e-07, "loss": 0.0295, "num_tokens": 207693314.0, "reward": 0.46103185415267944, "reward_std": 0.10157027840614319, "rewards/code_reward/mean": 0.36142244935035706, "rewards/code_reward/std": 0.31363171339035034, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 898.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 289.751953125, "completions/mean_terminated_length": 288.7377624511719, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.8164726947179947, "frac_reward_zero_std": 0.25, "grad_norm": 0.8815302476672208, "kl": 0.5634765625, "learning_rate": 5.949110205770292e-07, "loss": 0.0141, "num_tokens": 208156243.0, "reward": 0.5502228736877441, "reward_std": 0.1270667314529419, "rewards/code_reward/mean": 0.4508087635040283, "rewards/code_reward/std": 0.3920993208885193, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1721.0, "completions/max_terminated_length": 1721.0, "completions/mean_length": 293.765625, "completions/mean_terminated_length": 292.7632141113281, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.8182632050134289, "frac_reward_zero_std": 0.34375, "grad_norm": 0.5020556850370741, "kl": 0.225830078125, "learning_rate": 5.90767903450964e-07, "loss": 0.0091, "num_tokens": 208640043.0, "reward": 0.5585739612579346, "reward_std": 0.08153955638408661, "rewards/code_reward/mean": 0.46013641357421875, "rewards/code_reward/std": 0.3777037262916565, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 717.0, "completions/max_terminated_length": 717.0, "completions/mean_length": 290.072265625, "completions/mean_terminated_length": 290.072265625, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.8200537153088631, "frac_reward_zero_std": 0.296875, "grad_norm": 0.32151677522703725, "kl": 0.15283203125, "learning_rate": 5.867153833791652e-07, "loss": 0.0146, "num_tokens": 209080584.0, "reward": 0.4321288466453552, "reward_std": 0.06435224413871765, "rewards/code_reward/mean": 0.3331053853034973, "rewards/code_reward/std": 0.35965171456336975, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 718.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 284.20703125, "completions/mean_terminated_length": 284.20703125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.8218442256042973, "frac_reward_zero_std": 0.28125, "grad_norm": 0.3784712664936248, "kl": 0.177978515625, "learning_rate": 5.827536303972587e-07, "loss": 0.0473, "num_tokens": 209537058.0, "reward": 0.5829298496246338, "reward_std": 0.09573233127593994, "rewards/code_reward/mean": 0.4837110936641693, "rewards/code_reward/std": 0.3929855227470398, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 876.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 289.65234375, "completions/mean_terminated_length": 289.65234375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.8236347358997315, "frac_reward_zero_std": 0.21875, "grad_norm": 0.3381223494547023, "kl": 0.148193359375, "learning_rate": 5.78882810732465e-07, "loss": 0.0024, "num_tokens": 209967984.0, "reward": 0.5236642360687256, "reward_std": 0.11529242247343063, "rewards/code_reward/mean": 0.42366424202919006, "rewards/code_reward/std": 0.34745559096336365, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1132.0, "completions/max_terminated_length": 1132.0, "completions/mean_length": 299.583984375, "completions/mean_terminated_length": 299.583984375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.8254252461951657, "frac_reward_zero_std": 0.390625, "grad_norm": 0.28888875734340597, "kl": 0.155029296875, "learning_rate": 5.75103086796625e-07, "loss": 0.0028, "num_tokens": 210428587.0, "reward": 0.5320026874542236, "reward_std": 0.09544634819030762, "rewards/code_reward/mean": 0.43239325284957886, "rewards/code_reward/std": 0.389309287071228, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1838.0, "completions/max_terminated_length": 1838.0, "completions/mean_length": 294.380859375, "completions/mean_terminated_length": 294.380859375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.8272157564905999, "frac_reward_zero_std": 0.296875, "grad_norm": 0.31344687064202864, "kl": 0.1474609375, "learning_rate": 5.714146171793846e-07, "loss": 0.015, "num_tokens": 210913590.0, "reward": 0.4630996584892273, "reward_std": 0.0645325630903244, "rewards/code_reward/mean": 0.3638809323310852, "rewards/code_reward/std": 0.3631923198699951, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1125.0, "completions/max_terminated_length": 1125.0, "completions/mean_length": 287.84765625, "completions/mean_terminated_length": 287.84765625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.8290062667860341, "frac_reward_zero_std": 0.34375, "grad_norm": 0.30754240445927394, "kl": 0.15234375, "learning_rate": 5.678175566415422e-07, "loss": 0.0133, "num_tokens": 211373944.0, "reward": 0.5162513256072998, "reward_std": 0.0676489993929863, "rewards/code_reward/mean": 0.4172278940677643, "rewards/code_reward/std": 0.3563990592956543, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 282.06640625, "completions/mean_terminated_length": 278.6105651855469, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.8307967770814683, "frac_reward_zero_std": 0.265625, "grad_norm": 0.33601007501039676, "kl": 0.163330078125, "learning_rate": 5.643120561085528e-07, "loss": 0.0259, "num_tokens": 211833474.0, "reward": 0.5526015758514404, "reward_std": 0.106254443526268, "rewards/code_reward/mean": 0.4535781443119049, "rewards/code_reward/std": 0.392835795879364, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1408.0, "completions/max_terminated_length": 1408.0, "completions/mean_length": 302.984375, "completions/mean_terminated_length": 302.984375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.8325872873769025, "frac_reward_zero_std": 0.328125, "grad_norm": 0.2969042018526191, "kl": 0.14599609375, "learning_rate": 5.608982626641991e-07, "loss": 0.0017, "num_tokens": 212282346.0, "reward": 0.516247034072876, "reward_std": 0.09091551601886749, "rewards/code_reward/mean": 0.4170282781124115, "rewards/code_reward/std": 0.38079655170440674, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 848.0, "completions/mean_length": 293.158203125, "completions/mean_terminated_length": 289.72406005859375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.8343777976723367, "frac_reward_zero_std": 0.265625, "grad_norm": 0.31736718436707423, "kl": 0.14697265625, "learning_rate": 5.575763195444166e-07, "loss": 0.0337, "num_tokens": 212738019.0, "reward": 0.4943723678588867, "reward_std": 0.11701977252960205, "rewards/code_reward/mean": 0.3947629928588867, "rewards/code_reward/std": 0.3453756868839264, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 813.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 283.345703125, "completions/mean_terminated_length": 283.345703125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.8361683079677709, "frac_reward_zero_std": 0.28125, "grad_norm": 0.30608589344413545, "kl": 0.16455078125, "learning_rate": 5.543463661312847e-07, "loss": -0.011, "num_tokens": 213224476.0, "reward": 0.4604203402996063, "reward_std": 0.10184351354837418, "rewards/code_reward/mean": 0.36081093549728394, "rewards/code_reward/std": 0.3494998514652252, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 737.0, "completions/max_terminated_length": 737.0, "completions/mean_length": 287.681640625, "completions/mean_terminated_length": 287.681640625, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.8379588182632051, "frac_reward_zero_std": 0.25, "grad_norm": 0.3158461293138423, "kl": 0.144287109375, "learning_rate": 5.512085379471808e-07, "loss": 0.0236, "num_tokens": 213675513.0, "reward": 0.47279059886932373, "reward_std": 0.09371345490217209, "rewards/code_reward/mean": 0.37298592925071716, "rewards/code_reward/std": 0.3752439022064209, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 870.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 285.91015625, "completions/mean_terminated_length": 285.91015625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.8397493285586393, "frac_reward_zero_std": 0.265625, "grad_norm": 0.3103920998460024, "kl": 0.150146484375, "learning_rate": 5.481629666490903e-07, "loss": 0.0224, "num_tokens": 214140595.0, "reward": 0.5693438053131104, "reward_std": 0.11439032107591629, "rewards/code_reward/mean": 0.4701250195503235, "rewards/code_reward/std": 0.4059833884239197, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 979.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 286.369140625, "completions/mean_terminated_length": 286.369140625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.8415398388540735, "frac_reward_zero_std": 0.28125, "grad_norm": 0.3273248968125848, "kl": 0.148681640625, "learning_rate": 5.452097800230853e-07, "loss": -0.0059, "num_tokens": 214588544.0, "reward": 0.5359824299812317, "reward_std": 0.1152547150850296, "rewards/code_reward/mean": 0.4363730847835541, "rewards/code_reward/std": 0.36495664715766907, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1689.0, "completions/max_terminated_length": 1689.0, "completions/mean_length": 277.80078125, "completions/mean_terminated_length": 277.80078125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.8433303491495077, "frac_reward_zero_std": 0.1875, "grad_norm": 0.35059942558388646, "kl": 0.163330078125, "learning_rate": 5.423491019789623e-07, "loss": 0.0215, "num_tokens": 215056290.0, "reward": 0.4775276482105255, "reward_std": 0.12117785960435867, "rewards/code_reward/mean": 0.37772294878959656, "rewards/code_reward/std": 0.33281344175338745, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 889.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 284.39453125, "completions/mean_terminated_length": 284.39453125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.8451208594449419, "frac_reward_zero_std": 0.3125, "grad_norm": 0.3145385732107268, "kl": 0.15380859375, "learning_rate": 5.395810525450425e-07, "loss": 0.0167, "num_tokens": 215513228.0, "reward": 0.5176160335540771, "reward_std": 0.12138864398002625, "rewards/code_reward/mean": 0.41820192337036133, "rewards/code_reward/std": 0.383306086063385, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 281.123046875, "completions/mean_terminated_length": 281.123046875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.846911369740376, "frac_reward_zero_std": 0.28125, "grad_norm": 0.3296217958015705, "kl": 0.156494140625, "learning_rate": 5.369057478631359e-07, "loss": 0.0152, "num_tokens": 215947027.0, "reward": 0.5281923413276672, "reward_std": 0.11802750825881958, "rewards/code_reward/mean": 0.4287782609462738, "rewards/code_reward/std": 0.39612579345703125, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1348.0, "completions/max_terminated_length": 1348.0, "completions/mean_length": 288.388671875, "completions/mean_terminated_length": 288.388671875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.8487018800358102, "frac_reward_zero_std": 0.265625, "grad_norm": 0.3358563844149769, "kl": 0.15283203125, "learning_rate": 5.343233001836694e-07, "loss": 0.0242, "num_tokens": 216412858.0, "reward": 0.48578667640686035, "reward_std": 0.08888278901576996, "rewards/code_reward/mean": 0.38715386390686035, "rewards/code_reward/std": 0.3608415722846985, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 970.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 286.419921875, "completions/mean_terminated_length": 286.419921875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.8504923903312444, "frac_reward_zero_std": 0.234375, "grad_norm": 0.3378895680208083, "kl": 0.16015625, "learning_rate": 5.318338178609754e-07, "loss": 0.0161, "num_tokens": 216876817.0, "reward": 0.5365920066833496, "reward_std": 0.1062762588262558, "rewards/code_reward/mean": 0.43737322092056274, "rewards/code_reward/std": 0.36626142263412476, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1121.0, "completions/max_terminated_length": 1121.0, "completions/mean_length": 275.818359375, "completions/mean_terminated_length": 275.818359375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.8522829006266786, "frac_reward_zero_std": 0.21875, "grad_norm": 0.3421375277489184, "kl": 0.15673828125, "learning_rate": 5.294374053487459e-07, "loss": 0.0222, "num_tokens": 217310436.0, "reward": 0.534490704536438, "reward_std": 0.1259656846523285, "rewards/code_reward/mean": 0.4358578324317932, "rewards/code_reward/std": 0.3666819632053375, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 754.0, "completions/max_terminated_length": 754.0, "completions/mean_length": 268.1640625, "completions/mean_terminated_length": 268.1640625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.8540734109221128, "frac_reward_zero_std": 0.25, "grad_norm": 0.35099897687992027, "kl": 0.155517578125, "learning_rate": 5.271341631956511e-07, "loss": 0.0266, "num_tokens": 217774336.0, "reward": 0.5272774696350098, "reward_std": 0.11544080078601837, "rewards/code_reward/mean": 0.42825406789779663, "rewards/code_reward/std": 0.3779984712600708, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1075.0, "completions/max_terminated_length": 1075.0, "completions/mean_length": 280.998046875, "completions/mean_terminated_length": 280.998046875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.855863921217547, "frac_reward_zero_std": 0.296875, "grad_norm": 0.2988801966533657, "kl": 0.1572265625, "learning_rate": 5.249241880411181e-07, "loss": 0.0081, "num_tokens": 218203935.0, "reward": 0.4684543311595917, "reward_std": 0.10115277767181396, "rewards/code_reward/mean": 0.37001681327819824, "rewards/code_reward/std": 0.368169903755188, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1954.0, "completions/max_terminated_length": 1954.0, "completions/mean_length": 285.828125, "completions/mean_terminated_length": 285.828125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.8576544315129812, "frac_reward_zero_std": 0.328125, "grad_norm": 0.30927242840738844, "kl": 0.153564453125, "learning_rate": 5.228075726112785e-07, "loss": 0.0408, "num_tokens": 218683063.0, "reward": 0.5400015115737915, "reward_std": 0.09559574723243713, "rewards/code_reward/mean": 0.44058743119239807, "rewards/code_reward/std": 0.3887256681919098, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 698.0, "completions/max_terminated_length": 698.0, "completions/mean_length": 272.46484375, "completions/mean_terminated_length": 272.46484375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.8594449418084154, "frac_reward_zero_std": 0.328125, "grad_norm": 0.32229988587887864, "kl": 0.16259765625, "learning_rate": 5.207844057150768e-07, "loss": 0.0186, "num_tokens": 219131757.0, "reward": 0.514305830001831, "reward_std": 0.07932621985673904, "rewards/code_reward/mean": 0.41528236865997314, "rewards/code_reward/std": 0.38885927200317383, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 996.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 299.109375, "completions/mean_terminated_length": 299.109375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.8612354521038496, "frac_reward_zero_std": 0.34375, "grad_norm": 0.30711427098440236, "kl": 0.1484375, "learning_rate": 5.188547722405437e-07, "loss": 0.0129, "num_tokens": 219591501.0, "reward": 0.4486275315284729, "reward_std": 0.07159635424613953, "rewards/code_reward/mean": 0.34940874576568604, "rewards/code_reward/std": 0.33191025257110596, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1130.0, "completions/max_terminated_length": 1130.0, "completions/mean_length": 282.416015625, "completions/mean_terminated_length": 282.416015625, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.8630259623992838, "frac_reward_zero_std": 0.28125, "grad_norm": 0.3361220962623221, "kl": 0.165771484375, "learning_rate": 5.170187531512351e-07, "loss": 0.0116, "num_tokens": 220032426.0, "reward": 0.4597012400627136, "reward_std": 0.11640267819166183, "rewards/code_reward/mean": 0.36048251390457153, "rewards/code_reward/std": 0.34874680638313293, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1178.0, "completions/max_terminated_length": 1178.0, "completions/mean_length": 282.314453125, "completions/mean_terminated_length": 282.314453125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.864816472694718, "frac_reward_zero_std": 0.1875, "grad_norm": 0.34330494648955096, "kl": 0.16845703125, "learning_rate": 5.152764254828348e-07, "loss": 0.0139, "num_tokens": 220490059.0, "reward": 0.4551682770252228, "reward_std": 0.12994812428951263, "rewards/code_reward/mean": 0.3563401401042938, "rewards/code_reward/std": 0.3381604552268982, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 596.0, "completions/max_terminated_length": 596.0, "completions/mean_length": 276.501953125, "completions/mean_terminated_length": 276.501953125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.8666069829901522, "frac_reward_zero_std": 0.34375, "grad_norm": 0.3133554673620963, "kl": 0.142822265625, "learning_rate": 5.136278623399225e-07, "loss": -0.0065, "num_tokens": 220931612.0, "reward": 0.48414239287376404, "reward_std": 0.10344494134187698, "rewards/code_reward/mean": 0.38453301787376404, "rewards/code_reward/std": 0.3684546649456024, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 289.662109375, "completions/mean_terminated_length": 286.22113037109375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.8683974932855864, "frac_reward_zero_std": 0.28125, "grad_norm": 0.31679643682233977, "kl": 0.158935546875, "learning_rate": 5.120731328929058e-07, "loss": 0.0247, "num_tokens": 221397343.0, "reward": 0.4413906931877136, "reward_std": 0.09121271967887878, "rewards/code_reward/mean": 0.34217196702957153, "rewards/code_reward/std": 0.3609677851200104, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1033.0, "completions/max_terminated_length": 1033.0, "completions/mean_length": 270.052734375, "completions/mean_terminated_length": 270.052734375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.8701880035810206, "frac_reward_zero_std": 0.203125, "grad_norm": 0.36118511120723545, "kl": 0.185791015625, "learning_rate": 5.106123023751187e-07, "loss": 0.0159, "num_tokens": 221844674.0, "reward": 0.41520363092422485, "reward_std": 0.0882403627038002, "rewards/code_reward/mean": 0.3159848749637604, "rewards/code_reward/std": 0.3095768094062805, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 764.0, "completions/max_terminated_length": 764.0, "completions/mean_length": 281.70703125, "completions/mean_terminated_length": 281.70703125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.8719785138764548, "frac_reward_zero_std": 0.34375, "grad_norm": 0.3145598517509606, "kl": 0.151123046875, "learning_rate": 5.092454320800833e-07, "loss": 0.0006, "num_tokens": 222289444.0, "reward": 0.5400997996330261, "reward_std": 0.10451820492744446, "rewards/code_reward/mean": 0.4406857490539551, "rewards/code_reward/std": 0.3702850043773651, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 806.0, "completions/max_terminated_length": 806.0, "completions/mean_length": 287.0703125, "completions/mean_terminated_length": 287.0703125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.873769024171889, "frac_reward_zero_std": 0.203125, "grad_norm": 0.3412019101691059, "kl": 0.16015625, "learning_rate": 5.079725793589405e-07, "loss": 0.0236, "num_tokens": 222764480.0, "reward": 0.48061403632164, "reward_std": 0.10210969299077988, "rewards/code_reward/mean": 0.38237184286117554, "rewards/code_reward/std": 0.34035080671310425, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 689.0, "completions/max_terminated_length": 689.0, "completions/mean_length": 273.416015625, "completions/mean_terminated_length": 273.416015625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.8755595344673232, "frac_reward_zero_std": 0.359375, "grad_norm": 0.30686684783837476, "kl": 0.161376953125, "learning_rate": 5.067937976180407e-07, "loss": 0.0104, "num_tokens": 223232285.0, "reward": 0.5182619094848633, "reward_std": 0.06647691130638123, "rewards/code_reward/mean": 0.4190431833267212, "rewards/code_reward/std": 0.36016395688056946, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 286.716796875, "completions/mean_terminated_length": 279.809814453125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.8773500447627574, "frac_reward_zero_std": 0.21875, "grad_norm": 0.33929805383461065, "kl": 0.15625, "learning_rate": 5.057091363167046e-07, "loss": 0.055, "num_tokens": 223684908.0, "reward": 0.47709038853645325, "reward_std": 0.09164555370807648, "rewards/code_reward/mean": 0.3786528706550598, "rewards/code_reward/std": 0.3516968786716461, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 275.845703125, "completions/mean_terminated_length": 272.377685546875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.8791405550581916, "frac_reward_zero_std": 0.25, "grad_norm": 0.33561090360542434, "kl": 0.1474609375, "learning_rate": 5.047186409651489e-07, "loss": 0.0275, "num_tokens": 224141805.0, "reward": 0.5316396355628967, "reward_std": 0.1020936518907547, "rewards/code_reward/mean": 0.43242084980010986, "rewards/code_reward/std": 0.36819201707839966, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 735.0, "completions/max_terminated_length": 735.0, "completions/mean_length": 283.111328125, "completions/mean_terminated_length": 283.111328125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.8809310653536258, "frac_reward_zero_std": 0.34375, "grad_norm": 0.30623843820556473, "kl": 0.1572265625, "learning_rate": 5.038223531225742e-07, "loss": 0.0025, "num_tokens": 224607694.0, "reward": 0.5654200911521912, "reward_std": 0.09963131695985794, "rewards/code_reward/mean": 0.46581071615219116, "rewards/code_reward/std": 0.383987694978714, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1280.0, "completions/max_terminated_length": 1280.0, "completions/mean_length": 280.71875, "completions/mean_terminated_length": 280.71875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.88272157564906, "frac_reward_zero_std": 0.203125, "grad_norm": 0.3378886175674951, "kl": 0.155029296875, "learning_rate": 5.030203103954232e-07, "loss": 0.0214, "num_tokens": 225040990.0, "reward": 0.47813862562179565, "reward_std": 0.0889197438955307, "rewards/code_reward/mean": 0.3793104887008667, "rewards/code_reward/std": 0.36503762006759644, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1530.0, "completions/max_terminated_length": 1530.0, "completions/mean_length": 282.013671875, "completions/mean_terminated_length": 282.013671875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.8845120859444942, "frac_reward_zero_std": 0.234375, "grad_norm": 0.33521220786278016, "kl": 0.161376953125, "learning_rate": 5.023125464358026e-07, "loss": 0.0252, "num_tokens": 225462469.0, "reward": 0.5066580176353455, "reward_std": 0.0951513797044754, "rewards/code_reward/mean": 0.4078298807144165, "rewards/code_reward/std": 0.3673756420612335, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 741.0, "completions/max_terminated_length": 741.0, "completions/mean_length": 289.583984375, "completions/mean_terminated_length": 289.583984375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.8863025962399284, "frac_reward_zero_std": 0.25, "grad_norm": 0.323275256593473, "kl": 0.157958984375, "learning_rate": 5.016990909400709e-07, "loss": 0.0252, "num_tokens": 225914368.0, "reward": 0.5507107973098755, "reward_std": 0.12741786241531372, "rewards/code_reward/mean": 0.45129674673080444, "rewards/code_reward/std": 0.3367266058921814, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1315.0, "completions/max_terminated_length": 1315.0, "completions/mean_length": 293.0859375, "completions/mean_terminated_length": 293.0859375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.8880931065353626, "frac_reward_zero_std": 0.296875, "grad_norm": 0.3076098170281073, "kl": 0.1513671875, "learning_rate": 5.011799696475915e-07, "loss": -0.005, "num_tokens": 226376116.0, "reward": 0.5403420329093933, "reward_std": 0.10962044447660446, "rewards/code_reward/mean": 0.4411233067512512, "rewards/code_reward/std": 0.37339821457862854, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1180.0, "completions/max_terminated_length": 1180.0, "completions/mean_length": 283.0390625, "completions/mean_terminated_length": 283.0390625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.8898836168307968, "frac_reward_zero_std": 0.265625, "grad_norm": 0.36999730232034106, "kl": 0.1572265625, "learning_rate": 5.007552043396547e-07, "loss": 0.0354, "num_tokens": 226826416.0, "reward": 0.48462194204330444, "reward_std": 0.08926527202129364, "rewards/code_reward/mean": 0.38559848070144653, "rewards/code_reward/std": 0.36074957251548767, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1074.0, "completions/max_terminated_length": 1074.0, "completions/mean_length": 284.650390625, "completions/mean_terminated_length": 284.650390625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.891674127126231, "frac_reward_zero_std": 0.3125, "grad_norm": 0.3119945034610699, "kl": 0.147216796875, "learning_rate": 5.004248128385618e-07, "loss": 0.0106, "num_tokens": 227270693.0, "reward": 0.47703859210014343, "reward_std": 0.0954718068242073, "rewards/code_reward/mean": 0.3772338926792145, "rewards/code_reward/std": 0.32804709672927856, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 836.0, "completions/max_terminated_length": 836.0, "completions/mean_length": 298.841796875, "completions/mean_terminated_length": 298.841796875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.8934646374216652, "frac_reward_zero_std": 0.234375, "grad_norm": 0.3284309811727421, "kl": 0.148681640625, "learning_rate": 5.001888090068784e-07, "loss": 0.0105, "num_tokens": 227733484.0, "reward": 0.519349217414856, "reward_std": 0.13681122660636902, "rewards/code_reward/mean": 0.420521080493927, "rewards/code_reward/std": 0.3838867247104645, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1365.0, "completions/max_terminated_length": 1365.0, "completions/mean_length": 284.859375, "completions/mean_terminated_length": 284.859375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.8952551477170994, "frac_reward_zero_std": 0.328125, "grad_norm": 0.3167818186963849, "kl": 0.15869140625, "learning_rate": 5.000472027468528e-07, "loss": 0.0114, "num_tokens": 228191420.0, "reward": 0.46505969762802124, "reward_std": 0.10095734894275665, "rewards/code_reward/mean": 0.36584094166755676, "rewards/code_reward/std": 0.34102559089660645, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "step": 500 }, { "epoch": 0.8952551477170994, "step": 500, "total_flos": 0.0, "train_loss": 0.07975299004558474, "train_runtime": 60375.5417, "train_samples_per_second": 4.24, "train_steps_per_second": 0.008 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 228191420, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }