{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.33921302578018997, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 1914.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3331.0, "completions/max_terminated_length": 3331.0, "completions/mean_length": 1914.25, "completions/mean_terminated_length": 1914.25, "completions/min_length": 678.0, "completions/min_terminated_length": 678.0, "epoch": 0.00033921302578018993, "frac_reward_zero_std": 0.0, "grad_norm": 0.11923123896121979, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0003, "num_tokens": 36909.0, "reward": 0.6750000715255737, "reward_std": 0.061237238347530365, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 1 }, { "completion_length": 358.5, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 358.5, "completions/mean_terminated_length": 358.5, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.0006784260515603799, "frac_reward_zero_std": 0.0, "grad_norm": 0.16774195432662964, "kl": 0.0, "learning_rate": 1e-08, "loss": -0.0001, "num_tokens": 56733.0, "reward": 1.062500238418579, "reward_std": 0.14997151494026184, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.21250002086162567, "rewards/format_reward_func/std": 0.07723929733037949, "step": 2 }, { "completion_length": 1888.3334350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 4456.0, "completions/max_terminated_length": 4456.0, "completions/mean_length": 1888.3333740234375, "completions/mean_terminated_length": 1888.3333740234375, "completions/min_length": 647.0, "completions/min_terminated_length": 647.0, "epoch": 0.0010176390773405698, "frac_reward_zero_std": 0.5, "grad_norm": 0.14695346355438232, "kl": 0.0, "learning_rate": 2e-08, "loss": -0.0074, "num_tokens": 89689.0, "reward": 1.133333444595337, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 3 }, { "completion_length": 2936.916748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5567.0, "completions/mean_length": 3486.0, "completions/mean_terminated_length": 3203.9091796875, "completions/min_length": 1971.0, "completions/min_terminated_length": 1971.0, "epoch": 0.0013568521031207597, "frac_reward_zero_std": 0.5, "grad_norm": 1.444371223449707, "kl": NaN, "learning_rate": 3e-08, "loss": -0.0084, "num_tokens": 135204.0, "reward": 1.0125000476837158, "reward_std": 0.31494051218032837, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.45226702094078064, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 4 }, { "completion_length": 2239.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 5995.0, "completions/max_terminated_length": 5995.0, "completions/mean_length": 2239.5, "completions/mean_terminated_length": 2239.5, "completions/min_length": 646.0, "completions/min_terminated_length": 646.0, "epoch": 0.0016960651289009499, "frac_reward_zero_std": 0.0, "grad_norm": 1.1211001873016357, "kl": 0.0, "learning_rate": 4e-08, "loss": 0.0407, "num_tokens": 174384.0, "reward": 0.6833333373069763, "reward_std": 0.5333091616630554, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.07385490089654922, "step": 5 }, { "completion_length": 2380.25, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6126.0, "completions/mean_length": 3478.416748046875, "completions/mean_terminated_length": 2856.300048828125, "completions/min_length": 702.0, "completions/min_terminated_length": 702.0, "epoch": 0.0020352781546811396, "frac_reward_zero_std": 0.5, "grad_norm": 0.597161054611206, "kl": NaN, "learning_rate": 5e-08, "loss": -0.0538, "num_tokens": 213567.0, "reward": 1.0291666984558105, "reward_std": 0.24208299815654755, "rewards/correctness_reward_func/mean": 0.7666666507720947, "rewards/correctness_reward_func/std": 0.3700941801071167, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 6 }, { "completion_length": 1254.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3006.0, "completions/max_terminated_length": 3006.0, "completions/mean_length": 1254.75, "completions/mean_terminated_length": 1254.75, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 0.0023744911804613297, "frac_reward_zero_std": 0.0, "grad_norm": 0.42129963636398315, "kl": 0.0, "learning_rate": 6e-08, "loss": -0.0015, "num_tokens": 241776.0, "reward": 0.8166667819023132, "reward_std": 0.20202915370464325, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.5033223032951355, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.07385490089654922, "step": 7 }, { "completion_length": 634.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1214.0, "completions/max_terminated_length": 1214.0, "completions/mean_length": 634.25, "completions/mean_terminated_length": 634.25, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 0.0027137042062415195, "frac_reward_zero_std": 0.0, "grad_norm": 0.0805782824754715, "kl": 0.0, "learning_rate": 7e-08, "loss": -0.0005, "num_tokens": 262713.0, "reward": 0.21250002086162567, "reward_std": 0.06934845447540283, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.21250002086162567, "rewards/format_reward_func/std": 0.07723929733037949, "step": 8 }, { "completion_length": 1587.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2970.0, "completions/max_terminated_length": 2970.0, "completions/mean_length": 1587.166748046875, "completions/mean_terminated_length": 1587.166748046875, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 0.0030529172320217096, "frac_reward_zero_std": 0.5, "grad_norm": 0.6700242161750793, "kl": 0.0, "learning_rate": 8e-08, "loss": -0.0088, "num_tokens": 291287.0, "reward": 0.9000000953674316, "reward_std": 0.21908903121948242, "rewards/correctness_reward_func/mean": 0.6000000238418579, "rewards/correctness_reward_func/std": 0.36181363463401794, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 9 }, { "completion_length": 1056.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2969.0, "completions/max_terminated_length": 2969.0, "completions/mean_length": 1056.416748046875, "completions/mean_terminated_length": 1056.416748046875, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 0.0033921302578018998, "frac_reward_zero_std": 0.0, "grad_norm": 0.37774741649627686, "kl": 0.0, "learning_rate": 9e-08, "loss": -0.0003, "num_tokens": 314290.0, "reward": 0.949999988079071, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func/mean": 0.699999988079071, "rewards/correctness_reward_func/std": 0.3357488214969635, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.07385490089654922, "step": 10 }, { "completion_length": 3576.666748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5869.0, "completions/mean_length": 4125.75, "completions/mean_terminated_length": 3901.818359375, "completions/min_length": 1444.0, "completions/min_terminated_length": 1444.0, "epoch": 0.0037313432835820895, "frac_reward_zero_std": 0.0, "grad_norm": 0.22871029376983643, "kl": NaN, "learning_rate": 1e-07, "loss": -0.017, "num_tokens": 372348.0, "reward": 0.6791667342185974, "reward_std": 0.10357433557510376, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 11 }, { "completion_length": 1701.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4018.0, "completions/max_terminated_length": 4018.0, "completions/mean_length": 1701.916748046875, "completions/mean_terminated_length": 1701.916748046875, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 0.004070556309362279, "frac_reward_zero_std": 0.0, "grad_norm": 0.6973493695259094, "kl": 0.0, "learning_rate": 1.0999999999999999e-07, "loss": -0.004, "num_tokens": 409031.0, "reward": 0.7041667699813843, "reward_std": 0.44965147972106934, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.23750001192092896, "rewards/format_reward_func/std": 0.07723929733037949, "step": 12 }, { "completion_length": 1128.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2031.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1128.75, "completions/mean_terminated_length": 1128.75, "completions/min_length": 639.0, "completions/min_terminated_length": 639.0, "epoch": 0.004409769335142469, "frac_reward_zero_std": 0.5, "grad_norm": 0.07278123497962952, "kl": 0.0, "learning_rate": 1.2e-07, "loss": -0.001, "num_tokens": 434738.0, "reward": 0.762499988079071, "reward_std": 0.04107918590307236, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 13 }, { "completion_length": 1370.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3375.0, "completions/max_terminated_length": 3375.0, "completions/mean_length": 1370.166748046875, "completions/mean_terminated_length": 1370.166748046875, "completions/min_length": 520.0, "completions/min_terminated_length": 520.0, "epoch": 0.0047489823609226595, "frac_reward_zero_std": 0.0, "grad_norm": 0.4310975968837738, "kl": 0.0, "learning_rate": 1.3e-07, "loss": 0.0046, "num_tokens": 464644.0, "reward": 1.0541667938232422, "reward_std": 0.24682849645614624, "rewards/correctness_reward_func/mean": 0.7666666507720947, "rewards/correctness_reward_func/std": 0.2534608840942383, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 14 }, { "completion_length": 2577.416748046875, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5694.0, "completions/mean_length": 3675.58349609375, "completions/mean_terminated_length": 3092.900146484375, "completions/min_length": 1257.0, "completions/min_terminated_length": 1257.0, "epoch": 0.00508819538670285, "frac_reward_zero_std": 0.5, "grad_norm": 0.4105437397956848, "kl": NaN, "learning_rate": 1.4e-07, "loss": -0.0759, "num_tokens": 511977.0, "reward": 0.9291667938232422, "reward_std": 0.26571446657180786, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 15 }, { "completion_length": 2431.2501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4410.0, "completions/max_terminated_length": 4410.0, "completions/mean_length": 2431.25, "completions/mean_terminated_length": 2431.25, "completions/min_length": 1113.0, "completions/min_terminated_length": 1113.0, "epoch": 0.005427408412483039, "frac_reward_zero_std": 0.0, "grad_norm": 2.087972640991211, "kl": 0.0, "learning_rate": 1.5e-07, "loss": -0.0046, "num_tokens": 552924.0, "reward": 0.6625000238418579, "reward_std": 0.26881134510040283, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4972652792930603, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 16 }, { "completion_length": 1940.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3607.0, "completions/max_terminated_length": 3607.0, "completions/mean_length": 1940.5, "completions/mean_terminated_length": 1940.5, "completions/min_length": 601.0, "completions/min_terminated_length": 601.0, "epoch": 0.005766621438263229, "frac_reward_zero_std": 0.0, "grad_norm": 0.6243698000907898, "kl": 0.0, "learning_rate": 1.6e-07, "loss": -0.0143, "num_tokens": 585378.0, "reward": 0.8791667819023132, "reward_std": 0.45129260420799255, "rewards/correctness_reward_func/mean": 0.6166666150093079, "rewards/correctness_reward_func/std": 0.37618502974510193, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 17 }, { "completion_length": 913.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1744.0, "completions/max_terminated_length": 1744.0, "completions/mean_length": 913.1666870117188, "completions/mean_terminated_length": 913.1666870117188, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 0.006105834464043419, "frac_reward_zero_std": 0.0, "grad_norm": 0.548653244972229, "kl": 0.0, "learning_rate": 1.7000000000000001e-07, "loss": 0.012, "num_tokens": 606578.0, "reward": 1.0875000953674316, "reward_std": 0.4288218021392822, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.38138505816459656, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 18 }, { "completion_length": 859.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 3215.0, "completions/max_terminated_length": 3215.0, "completions/mean_length": 859.75, "completions/mean_terminated_length": 859.75, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.006445047489823609, "frac_reward_zero_std": 0.0, "grad_norm": 0.5170750617980957, "kl": 0.0, "learning_rate": 1.8e-07, "loss": -0.0076, "num_tokens": 627719.0, "reward": 0.8833333849906921, "reward_std": 0.30571478605270386, "rewards/correctness_reward_func/mean": 0.6333333253860474, "rewards/correctness_reward_func/std": 0.3892494738101959, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.07385490089654922, "step": 19 }, { "completion_length": 1683.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3455.0, "completions/max_terminated_length": 3455.0, "completions/mean_length": 1683.75, "completions/mean_terminated_length": 1683.75, "completions/min_length": 768.0, "completions/min_terminated_length": 768.0, "epoch": 0.0067842605156037995, "frac_reward_zero_std": 0.5, "grad_norm": 0.09887401759624481, "kl": 0.0, "learning_rate": 1.8999999999999998e-07, "loss": -0.001, "num_tokens": 657884.0, "reward": 0.762499988079071, "reward_std": 0.041079193353652954, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 20 }, { "completion_length": 2319.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4392.0, "completions/max_terminated_length": 4392.0, "completions/mean_length": 2319.166748046875, "completions/mean_terminated_length": 2319.166748046875, "completions/min_length": 1144.0, "completions/min_terminated_length": 1144.0, "epoch": 0.007123473541383989, "frac_reward_zero_std": 0.0, "grad_norm": 0.4928167760372162, "kl": 0.0, "learning_rate": 2e-07, "loss": 0.0001, "num_tokens": 695722.0, "reward": 1.070833444595337, "reward_std": 0.2486901879310608, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.26227444410324097, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 21 }, { "completion_length": 723.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1397.0, "completions/max_terminated_length": 1397.0, "completions/mean_length": 723.5833740234375, "completions/mean_terminated_length": 723.5833740234375, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 0.007462686567164179, "frac_reward_zero_std": 0.0, "grad_norm": 0.5503113865852356, "kl": 0.0, "learning_rate": 2.0999999999999997e-07, "loss": -0.0044, "num_tokens": 716675.0, "reward": 0.8333333730697632, "reward_std": 0.5088584423065186, "rewards/correctness_reward_func/mean": 0.5833333134651184, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.07385490089654922, "step": 22 }, { "completion_length": 1546.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3261.0, "completions/max_terminated_length": 3261.0, "completions/mean_length": 1546.5833740234375, "completions/mean_terminated_length": 1546.5833740234375, "completions/min_length": 726.0, "completions/min_terminated_length": 726.0, "epoch": 0.007801899592944369, "frac_reward_zero_std": 0.0, "grad_norm": 0.530082643032074, "kl": 0.0, "learning_rate": 2.1999999999999998e-07, "loss": -0.0047, "num_tokens": 744030.0, "reward": 0.5625, "reward_std": 0.295512855052948, "rewards/correctness_reward_func/mean": 0.29999998211860657, "rewards/correctness_reward_func/std": 0.4472135901451111, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 23 }, { "completion_length": 3152.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5800.0, "completions/max_terminated_length": 5800.0, "completions/mean_length": 3152.08349609375, "completions/mean_terminated_length": 3152.08349609375, "completions/min_length": 1237.0, "completions/min_terminated_length": 1237.0, "epoch": 0.008141112618724558, "frac_reward_zero_std": 1.0, "grad_norm": 5.056385816715192e-07, "kl": 0.0, "learning_rate": 2.3e-07, "loss": 0.0, "num_tokens": 794473.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 24 }, { "completion_length": 540.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1220.0, "completions/max_terminated_length": 1220.0, "completions/mean_length": 540.8333740234375, "completions/mean_terminated_length": 540.8333740234375, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.008480325644504749, "frac_reward_zero_std": 0.5, "grad_norm": 0.24769005179405212, "kl": 0.0, "learning_rate": 2.4e-07, "loss": -0.0003, "num_tokens": 811541.0, "reward": 0.7666667699813843, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 25 }, { "completion_length": 1279.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2073.0, "completions/max_terminated_length": 2073.0, "completions/mean_length": 1279.666748046875, "completions/mean_terminated_length": 1279.666748046875, "completions/min_length": 647.0, "completions/min_terminated_length": 647.0, "epoch": 0.008819538670284939, "frac_reward_zero_std": 0.0, "grad_norm": 0.1748448610305786, "kl": 0.0, "learning_rate": 2.5e-07, "loss": 0.0003, "num_tokens": 836509.0, "reward": 1.1666667461395264, "reward_std": 0.09559705853462219, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 26 }, { "completion_length": 1812.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 2724.0, "completions/max_terminated_length": 2724.0, "completions/mean_length": 1812.5, "completions/mean_terminated_length": 1812.5, "completions/min_length": 861.0, "completions/min_terminated_length": 861.0, "epoch": 0.009158751696065129, "frac_reward_zero_std": 0.5, "grad_norm": 0.10540489852428436, "kl": 0.0, "learning_rate": 2.6e-07, "loss": -0.0019, "num_tokens": 868297.0, "reward": 1.1541666984558105, "reward_std": 0.05103101581335068, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 27 }, { "completion_length": 1048.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2830.0, "completions/max_terminated_length": 2830.0, "completions/mean_length": 1048.0833740234375, "completions/mean_terminated_length": 1048.0833740234375, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 0.009497964721845319, "frac_reward_zero_std": 0.0, "grad_norm": 0.6303296089172363, "kl": 0.0, "learning_rate": 2.7e-07, "loss": -0.0051, "num_tokens": 891440.0, "reward": 0.9958333373069763, "reward_std": 0.41863998770713806, "rewards/correctness_reward_func/mean": 0.7333332896232605, "rewards/correctness_reward_func/std": 0.3550501763820648, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 28 }, { "completion_length": 1568.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3112.0, "completions/max_terminated_length": 3112.0, "completions/mean_length": 1568.0833740234375, "completions/mean_terminated_length": 1568.0833740234375, "completions/min_length": 1012.0, "completions/min_terminated_length": 1012.0, "epoch": 0.00983717774762551, "frac_reward_zero_std": 0.0, "grad_norm": 0.7948746085166931, "kl": 0.0, "learning_rate": 2.8e-07, "loss": 0.0124, "num_tokens": 922821.0, "reward": 0.783333420753479, "reward_std": 0.45408618450164795, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.07385490089654922, "step": 29 }, { "completion_length": 1348.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2316.0, "completions/max_terminated_length": 2316.0, "completions/mean_length": 1348.0, "completions/mean_terminated_length": 1348.0, "completions/min_length": 635.0, "completions/min_terminated_length": 635.0, "epoch": 0.0101763907734057, "frac_reward_zero_std": 0.0, "grad_norm": 0.48874858021736145, "kl": 0.0, "learning_rate": 2.9e-07, "loss": -0.0019, "num_tokens": 948873.0, "reward": 0.8166667819023132, "reward_std": 0.2557638883590698, "rewards/correctness_reward_func/mean": 0.5166666507720947, "rewards/correctness_reward_func/std": 0.4628632962703705, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 30 }, { "completion_length": 699.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1161.0, "completions/max_terminated_length": 1161.0, "completions/mean_length": 699.8333740234375, "completions/mean_terminated_length": 699.8333740234375, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.01051560379918589, "frac_reward_zero_std": 0.5, "grad_norm": 0.5726268291473389, "kl": 0.0, "learning_rate": 3e-07, "loss": -0.0029, "num_tokens": 971413.0, "reward": 0.887499988079071, "reward_std": 0.26016825437545776, "rewards/correctness_reward_func/mean": 0.699999988079071, "rewards/correctness_reward_func/std": 0.43064433336257935, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.06784005463123322, "step": 31 }, { "completion_length": 3761.58349609375, "completions/clipped_ratio": 0.0, "completions/max_length": 5482.0, "completions/max_terminated_length": 5482.0, "completions/mean_length": 3761.58349609375, "completions/mean_terminated_length": 3761.58349609375, "completions/min_length": 1546.0, "completions/min_terminated_length": 1546.0, "epoch": 0.010854816824966078, "frac_reward_zero_std": 0.0, "grad_norm": 1.1386401653289795, "kl": 0.0, "learning_rate": 3.1e-07, "loss": -0.0138, "num_tokens": 1028342.0, "reward": 0.7250000834465027, "reward_std": 0.4932064414024353, "rewards/correctness_reward_func/mean": 0.44999995827674866, "rewards/correctness_reward_func/std": 0.47577691078186035, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 32 }, { "completion_length": 1955.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3581.0, "completions/max_terminated_length": 3581.0, "completions/mean_length": 1955.3333740234375, "completions/mean_terminated_length": 1955.3333740234375, "completions/min_length": 715.0, "completions/min_terminated_length": 715.0, "epoch": 0.011194029850746268, "frac_reward_zero_std": 0.5, "grad_norm": 0.5336142778396606, "kl": 0.0, "learning_rate": 3.2e-07, "loss": 0.0114, "num_tokens": 1064280.0, "reward": 1.0375001430511475, "reward_std": 0.20600366592407227, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.24308621883392334, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 33 }, { "completion_length": 2596.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5696.0, "completions/max_terminated_length": 5696.0, "completions/mean_length": 2596.08349609375, "completions/mean_terminated_length": 2596.08349609375, "completions/min_length": 734.0, "completions/min_terminated_length": 734.0, "epoch": 0.011533242876526458, "frac_reward_zero_std": 0.5, "grad_norm": 0.12288849800825119, "kl": 0.0, "learning_rate": 3.3e-07, "loss": 0.0018, "num_tokens": 1106305.0, "reward": 0.6750000715255737, "reward_std": 0.03872983902692795, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 34 }, { "completion_length": 762.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 931.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 762.0833740234375, "completions/mean_terminated_length": 762.0833740234375, "completions/min_length": 600.0, "completions/min_terminated_length": 600.0, "epoch": 0.011872455902306648, "frac_reward_zero_std": 0.0, "grad_norm": 0.3861258625984192, "kl": 0.0, "learning_rate": 3.4000000000000003e-07, "loss": -0.0014, "num_tokens": 1127090.0, "reward": 0.595833420753479, "reward_std": 0.23264777660369873, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 35 }, { "completion_length": 1244.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2654.0, "completions/max_terminated_length": 2654.0, "completions/mean_length": 1244.5, "completions/mean_terminated_length": 1244.5, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "epoch": 0.012211668928086838, "frac_reward_zero_std": 0.5, "grad_norm": 0.5422102212905884, "kl": 0.0, "learning_rate": 3.5e-07, "loss": 0.0022, "num_tokens": 1153376.0, "reward": 0.8583333492279053, "reward_std": 0.21946904063224792, "rewards/correctness_reward_func/mean": 0.5833333134651184, "rewards/correctness_reward_func/std": 0.5149286389350891, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 36 }, { "completion_length": 1497.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3081.0, "completions/max_terminated_length": 3081.0, "completions/mean_length": 1497.5833740234375, "completions/mean_terminated_length": 1497.5833740234375, "completions/min_length": 513.0, "completions/min_terminated_length": 513.0, "epoch": 0.012550881953867029, "frac_reward_zero_std": 0.5, "grad_norm": 0.6115618348121643, "kl": 0.0, "learning_rate": 3.6e-07, "loss": 0.0252, "num_tokens": 1183983.0, "reward": 0.9916667938232422, "reward_std": 0.2866472899913788, "rewards/correctness_reward_func/mean": 0.7166666984558105, "rewards/correctness_reward_func/std": 0.3459725081920624, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 37 }, { "completion_length": 1688.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 5040.0, "completions/max_terminated_length": 5040.0, "completions/mean_length": 1688.75, "completions/mean_terminated_length": 1688.75, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 0.012890094979647219, "frac_reward_zero_std": 0.5, "grad_norm": 0.9230490922927856, "kl": 0.0, "learning_rate": 3.7e-07, "loss": -0.005, "num_tokens": 1215972.0, "reward": 0.5, "reward_std": 0.21908903121948242, "rewards/correctness_reward_func/mean": 0.20000000298023224, "rewards/correctness_reward_func/std": 0.36181363463401794, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 38 }, { "completion_length": 1880.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3490.0, "completions/max_terminated_length": 3490.0, "completions/mean_length": 1880.5833740234375, "completions/mean_terminated_length": 1880.5833740234375, "completions/min_length": 737.0, "completions/min_terminated_length": 737.0, "epoch": 0.013229308005427409, "frac_reward_zero_std": 0.5, "grad_norm": 0.08842471987009048, "kl": 0.0, "learning_rate": 3.7999999999999996e-07, "loss": 0.0011, "num_tokens": 1250587.0, "reward": 1.1875, "reward_std": 0.03061862848699093, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 39 }, { "completion_length": 1623.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3097.0, "completions/max_terminated_length": 3097.0, "completions/mean_length": 1623.5833740234375, "completions/mean_terminated_length": 1623.5833740234375, "completions/min_length": 996.0, "completions/min_terminated_length": 996.0, "epoch": 0.013568521031207599, "frac_reward_zero_std": 0.0, "grad_norm": 0.7013817429542542, "kl": 0.0, "learning_rate": 3.8999999999999997e-07, "loss": 0.0118, "num_tokens": 1284230.0, "reward": 1.0250000953674316, "reward_std": 0.42866072058677673, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.36306774616241455, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 40 }, { "completion_length": 1975.3333740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 3728.0, "completions/mean_length": 2524.416748046875, "completions/mean_terminated_length": 2154.9091796875, "completions/min_length": 724.0, "completions/min_terminated_length": 724.0, "epoch": 0.013907734056987787, "frac_reward_zero_std": 0.0, "grad_norm": 0.6931108236312866, "kl": NaN, "learning_rate": 4e-07, "loss": -0.0256, "num_tokens": 1318170.0, "reward": 0.7166666984558105, "reward_std": 0.4313082695007324, "rewards/correctness_reward_func/mean": 0.46666669845581055, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.07385490089654922, "step": 41 }, { "completion_length": 1261.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2707.0, "completions/max_terminated_length": 2707.0, "completions/mean_length": 1261.416748046875, "completions/mean_terminated_length": 1261.416748046875, "completions/min_length": 644.0, "completions/min_terminated_length": 644.0, "epoch": 0.014246947082767978, "frac_reward_zero_std": 0.5, "grad_norm": 0.532822847366333, "kl": 0.0, "learning_rate": 4.0999999999999994e-07, "loss": 0.0143, "num_tokens": 1341893.0, "reward": 1.0875000953674316, "reward_std": 0.23008152842521667, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.2696799635887146, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 42 }, { "completion_length": 1444.25, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4008.0, "completions/mean_length": 1993.3333740234375, "completions/mean_terminated_length": 1575.5455322265625, "completions/min_length": 525.0, "completions/min_terminated_length": 525.0, "epoch": 0.014586160108548168, "frac_reward_zero_std": 0.5, "grad_norm": 0.7497459650039673, "kl": NaN, "learning_rate": 4.1999999999999995e-07, "loss": -0.0279, "num_tokens": 1370816.0, "reward": 1.008333444595337, "reward_std": 0.22453658282756805, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 43 }, { "completion_length": 1885.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4084.0, "completions/max_terminated_length": 4084.0, "completions/mean_length": 1885.0833740234375, "completions/mean_terminated_length": 1885.0833740234375, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 0.014925373134328358, "frac_reward_zero_std": 0.5, "grad_norm": 0.11372081935405731, "kl": 0.0, "learning_rate": 4.2999999999999996e-07, "loss": -0.0021, "num_tokens": 1403379.0, "reward": 0.6625000834465027, "reward_std": 0.041079193353652954, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 44 }, { "completion_length": 967.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 2843.0, "completions/max_terminated_length": 2843.0, "completions/mean_length": 967.0, "completions/mean_terminated_length": 967.0, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 0.015264586160108548, "frac_reward_zero_std": 0.0, "grad_norm": 0.4614796042442322, "kl": 0.0, "learning_rate": 4.3999999999999997e-07, "loss": -0.0099, "num_tokens": 1427145.0, "reward": 0.5708333849906921, "reward_std": 0.23264777660369873, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.23750001192092896, "rewards/format_reward_func/std": 0.07723929733037949, "step": 45 }, { "completion_length": 1987.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5091.0, "completions/max_terminated_length": 5091.0, "completions/mean_length": 1987.5833740234375, "completions/mean_terminated_length": 1987.5833740234375, "completions/min_length": 601.0, "completions/min_terminated_length": 601.0, "epoch": 0.015603799185888738, "frac_reward_zero_std": 0.5, "grad_norm": 0.45858681201934814, "kl": 0.0, "learning_rate": 4.5e-07, "loss": -0.0061, "num_tokens": 1462708.0, "reward": 1.133333444595337, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.2806917726993561, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 46 }, { "completion_length": 2157.0001220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 3806.0, "completions/mean_length": 2706.08349609375, "completions/mean_terminated_length": 2353.091064453125, "completions/min_length": 1247.0, "completions/min_terminated_length": 1247.0, "epoch": 0.01594301221166893, "frac_reward_zero_std": 0.0, "grad_norm": 0.5918885469436646, "kl": NaN, "learning_rate": 4.6e-07, "loss": -0.0191, "num_tokens": 1500250.0, "reward": 0.9625000953674316, "reward_std": 0.4404165744781494, "rewards/correctness_reward_func/mean": 0.699999988079071, "rewards/correctness_reward_func/std": 0.3357488214969635, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 47 }, { "completion_length": 2361.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 6359.0, "completions/max_terminated_length": 6359.0, "completions/mean_length": 2361.166748046875, "completions/mean_terminated_length": 2361.166748046875, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 0.016282225237449117, "frac_reward_zero_std": 0.0, "grad_norm": 8.699708938598633, "kl": 0.0, "learning_rate": 4.6999999999999995e-07, "loss": -0.0034, "num_tokens": 1538424.0, "reward": 0.6791667342185974, "reward_std": 0.08190402388572693, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 48 }, { "completion_length": 3042.5833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6276.0, "completions/mean_length": 3591.666748046875, "completions/mean_terminated_length": 3319.181884765625, "completions/min_length": 962.0, "completions/min_terminated_length": 962.0, "epoch": 0.01662143826322931, "frac_reward_zero_std": 0.0, "grad_norm": 0.9630971550941467, "kl": NaN, "learning_rate": 4.8e-07, "loss": -0.0544, "num_tokens": 1590673.0, "reward": 0.7124999761581421, "reward_std": 0.49692243337631226, "rewards/correctness_reward_func/mean": 0.45000001788139343, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 49 }, { "completion_length": 930.0000305175781, "completions/clipped_ratio": 0.5, "completions/max_length": 6589.0, "completions/max_terminated_length": 2777.0, "completions/mean_length": 4224.5, "completions/mean_terminated_length": 1860.0, "completions/min_length": 740.0, "completions/min_terminated_length": 740.0, "epoch": 0.016960651289009497, "frac_reward_zero_std": 0.5, "grad_norm": 0.09431289881467819, "kl": NaN, "learning_rate": 4.9e-07, "loss": -0.0007, "num_tokens": 1614277.0, "reward": 0.5666667222976685, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.15000000596046448, "rewards/format_reward_func/std": 0.15666989982128143, "step": 50 }, { "completion_length": 1810.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3579.0, "completions/max_terminated_length": 3579.0, "completions/mean_length": 1810.25, "completions/mean_terminated_length": 1810.25, "completions/min_length": 668.0, "completions/min_terminated_length": 668.0, "epoch": 0.01729986431478969, "frac_reward_zero_std": 0.0, "grad_norm": 0.6033844947814941, "kl": 0.0, "learning_rate": 5e-07, "loss": 0.0042, "num_tokens": 1650094.0, "reward": 0.9291666746139526, "reward_std": 0.38409334421157837, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 51 }, { "completion_length": 3461.2501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 5404.0, "completions/max_terminated_length": 5404.0, "completions/mean_length": 3461.25, "completions/mean_terminated_length": 3461.25, "completions/min_length": 2344.0, "completions/min_terminated_length": 2344.0, "epoch": 0.017639077340569877, "frac_reward_zero_std": 0.0, "grad_norm": 0.9066393971443176, "kl": 0.0, "learning_rate": 4.998274672187715e-07, "loss": 0.0169, "num_tokens": 1707199.0, "reward": 0.8625000715255737, "reward_std": 0.43920692801475525, "rewards/correctness_reward_func/mean": 0.6000000238418579, "rewards/correctness_reward_func/std": 0.36181360483169556, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 52 }, { "completion_length": 621.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1293.0, "completions/max_terminated_length": 1293.0, "completions/mean_length": 621.6666870117188, "completions/mean_terminated_length": 621.6666870117188, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 0.01797829036635007, "frac_reward_zero_std": 0.0, "grad_norm": 0.07819852232933044, "kl": 0.0, "learning_rate": 4.996549344375431e-07, "loss": -0.0013, "num_tokens": 1725603.0, "reward": 0.7124999761581421, "reward_std": 0.06934843957424164, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.21250002086162567, "rewards/format_reward_func/std": 0.07723929733037949, "step": 53 }, { "completion_length": 2232.0000610351562, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5477.0, "completions/mean_length": 3330.166748046875, "completions/mean_terminated_length": 2678.400146484375, "completions/min_length": 944.0, "completions/min_terminated_length": 944.0, "epoch": 0.018317503392130258, "frac_reward_zero_std": 0.0, "grad_norm": 0.5813143849372864, "kl": NaN, "learning_rate": 4.994824016563146e-07, "loss": -0.0049, "num_tokens": 1765119.0, "reward": 0.5583333373069763, "reward_std": 0.23038136959075928, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.2250000238418579, "rewards/format_reward_func/std": 0.11965861171483994, "step": 54 }, { "completion_length": 529.9166717529297, "completions/clipped_ratio": 0.0, "completions/max_length": 700.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 529.9166870117188, "completions/mean_terminated_length": 529.9166870117188, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.018656716417910446, "frac_reward_zero_std": 0.5, "grad_norm": 0.04094817116856575, "kl": 0.0, "learning_rate": 4.993098688750863e-07, "loss": 0.0001, "num_tokens": 1786892.0, "reward": 1.1375000476837158, "reward_std": 0.030618613585829735, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.23750001192092896, "rewards/format_reward_func/std": 0.07723929733037949, "step": 55 }, { "completion_length": 2408.58349609375, "completions/clipped_ratio": 0.0, "completions/max_length": 4855.0, "completions/max_terminated_length": 4855.0, "completions/mean_length": 2408.58349609375, "completions/mean_terminated_length": 2408.58349609375, "completions/min_length": 1078.0, "completions/min_terminated_length": 1078.0, "epoch": 0.018995929443690638, "frac_reward_zero_std": 0.5, "grad_norm": 0.11791203916072845, "kl": 0.0, "learning_rate": 4.991373360938578e-07, "loss": 0.0014, "num_tokens": 1823361.0, "reward": 0.7875000834465027, "reward_std": 0.03061862289905548, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 56 }, { "completion_length": 499.4166717529297, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 499.41668701171875, "completions/mean_terminated_length": 499.41668701171875, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 0.019335142469470826, "frac_reward_zero_std": 1.0, "grad_norm": 2.9338906415432575e-07, "kl": 0.0, "learning_rate": 4.989648033126294e-07, "loss": 0.0, "num_tokens": 1837562.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 57 }, { "completion_length": 1457.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3312.0, "completions/max_terminated_length": 3312.0, "completions/mean_length": 1457.5, "completions/mean_terminated_length": 1457.5, "completions/min_length": 528.0, "completions/min_terminated_length": 528.0, "epoch": 0.01967435549525102, "frac_reward_zero_std": 0.5, "grad_norm": 0.41315922141075134, "kl": 0.0, "learning_rate": 4.98792270531401e-07, "loss": 0.0077, "num_tokens": 1867334.0, "reward": 0.9541667699813843, "reward_std": 0.22716552019119263, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 58 }, { "completion_length": 2239.666748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4822.0, "completions/mean_length": 2788.75, "completions/mean_terminated_length": 2443.272705078125, "completions/min_length": 1249.0, "completions/min_terminated_length": 1249.0, "epoch": 0.020013568521031207, "frac_reward_zero_std": 0.0, "grad_norm": 0.6937410831451416, "kl": NaN, "learning_rate": 4.986197377501725e-07, "loss": -0.0354, "num_tokens": 1910866.0, "reward": 0.6125000715255737, "reward_std": 0.26673299074172974, "rewards/correctness_reward_func/mean": 0.3500000238418579, "rewards/correctness_reward_func/std": 0.4358898997306824, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 59 }, { "completion_length": 740.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1572.0, "completions/max_terminated_length": 1572.0, "completions/mean_length": 740.25, "completions/mean_terminated_length": 740.25, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.0203527815468114, "frac_reward_zero_std": 0.0, "grad_norm": 0.09556394070386887, "kl": 0.0, "learning_rate": 4.984472049689441e-07, "loss": -0.0012, "num_tokens": 1933849.0, "reward": 1.2208333015441895, "reward_std": 0.07144342362880707, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.23750001192092896, "rewards/format_reward_func/std": 0.07723929733037949, "step": 60 }, { "completion_length": 546.4166717529297, "completions/clipped_ratio": 0.5, "completions/max_length": 6589.0, "completions/max_terminated_length": 1595.0, "completions/mean_length": 3840.916748046875, "completions/mean_terminated_length": 1092.8333740234375, "completions/min_length": 628.0, "completions/min_terminated_length": 628.0, "epoch": 0.020691994572591587, "frac_reward_zero_std": 0.0, "grad_norm": 0.37896645069122314, "kl": NaN, "learning_rate": 4.982746721877156e-07, "loss": 0.006, "num_tokens": 1950060.0, "reward": 0.4833333492279053, "reward_std": 0.22453658282756805, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.15000000596046448, "rewards/format_reward_func/std": 0.1430193930864334, "step": 61 }, { "completion_length": 958.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1381.0, "completions/max_terminated_length": 1381.0, "completions/mean_length": 958.3333740234375, "completions/mean_terminated_length": 958.3333740234375, "completions/min_length": 633.0, "completions/min_terminated_length": 633.0, "epoch": 0.02103120759837178, "frac_reward_zero_std": 0.5, "grad_norm": 0.059960030019283295, "kl": 0.0, "learning_rate": 4.981021394064872e-07, "loss": 0.0007, "num_tokens": 1975876.0, "reward": 0.6875001788139343, "reward_std": 0.03061862289905548, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 62 }, { "completion_length": 1011.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2134.0, "completions/max_terminated_length": 2134.0, "completions/mean_length": 1011.5, "completions/mean_terminated_length": 1011.5, "completions/min_length": 520.0, "completions/min_terminated_length": 520.0, "epoch": 0.021370420624151967, "frac_reward_zero_std": 1.0, "grad_norm": 3.14549453150903e-07, "kl": 0.0, "learning_rate": 4.979296066252588e-07, "loss": 0.0, "num_tokens": 2003572.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 63 }, { "completion_length": 2115.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4262.0, "completions/max_terminated_length": 4262.0, "completions/mean_length": 2115.58349609375, "completions/mean_terminated_length": 2115.58349609375, "completions/min_length": 690.0, "completions/min_terminated_length": 690.0, "epoch": 0.021709633649932156, "frac_reward_zero_std": 1.0, "grad_norm": 2.780624583920144e-07, "kl": 0.0, "learning_rate": 4.977570738440303e-07, "loss": 0.0, "num_tokens": 2040947.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 64 }, { "completion_length": 5106.33349609375, "completions/clipped_ratio": 0.0, "completions/max_length": 6488.0, "completions/max_terminated_length": 6488.0, "completions/mean_length": 5106.33349609375, "completions/mean_terminated_length": 5106.33349609375, "completions/min_length": 2990.0, "completions/min_terminated_length": 2990.0, "epoch": 0.022048846675712348, "frac_reward_zero_std": 0.0, "grad_norm": 1.3122785091400146, "kl": 0.0, "learning_rate": 4.975845410628019e-07, "loss": -0.008, "num_tokens": 2112147.0, "reward": 0.7666666507720947, "reward_std": 0.521875262260437, "rewards/correctness_reward_func/mean": 0.5166666507720947, "rewards/correctness_reward_func/std": 0.4628632962703705, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.07385490089654922, "step": 65 }, { "completion_length": 2463.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4120.0, "completions/max_terminated_length": 4120.0, "completions/mean_length": 2463.916748046875, "completions/mean_terminated_length": 2463.916748046875, "completions/min_length": 899.0, "completions/min_terminated_length": 899.0, "epoch": 0.022388059701492536, "frac_reward_zero_std": 0.0, "grad_norm": 0.8168641924858093, "kl": 0.0, "learning_rate": 4.974120082815735e-07, "loss": -0.0017, "num_tokens": 2156594.0, "reward": 0.6208333969116211, "reward_std": 0.44094666838645935, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 66 }, { "completion_length": 2151.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4150.0, "completions/max_terminated_length": 4150.0, "completions/mean_length": 2151.166748046875, "completions/mean_terminated_length": 2151.166748046875, "completions/min_length": 526.0, "completions/min_terminated_length": 526.0, "epoch": 0.022727272727272728, "frac_reward_zero_std": 0.0, "grad_norm": 0.6010086536407471, "kl": 0.0, "learning_rate": 4.97239475500345e-07, "loss": -0.0047, "num_tokens": 2194228.0, "reward": 1.0, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func/mean": 0.699999988079071, "rewards/correctness_reward_func/std": 0.3357488214969635, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 67 }, { "completion_length": 690.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1049.0, "completions/max_terminated_length": 1049.0, "completions/mean_length": 690.75, "completions/mean_terminated_length": 690.75, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 0.023066485753052916, "frac_reward_zero_std": 0.5, "grad_norm": 0.03490245342254639, "kl": 0.0, "learning_rate": 4.970669427191166e-07, "loss": -0.0001, "num_tokens": 2215375.0, "reward": 0.6875001788139343, "reward_std": 0.03061862289905548, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 68 }, { "completion_length": 2392.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4364.0, "completions/max_terminated_length": 4364.0, "completions/mean_length": 2392.83349609375, "completions/mean_terminated_length": 2392.83349609375, "completions/min_length": 1438.0, "completions/min_terminated_length": 1438.0, "epoch": 0.023405698778833108, "frac_reward_zero_std": 0.0, "grad_norm": 0.14046727120876312, "kl": 0.0, "learning_rate": 4.968944099378881e-07, "loss": 0.0027, "num_tokens": 2255921.0, "reward": 0.6916667819023132, "reward_std": 0.07955466210842133, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 69 }, { "completion_length": 3071.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 6239.0, "completions/max_terminated_length": 6239.0, "completions/mean_length": 3071.83349609375, "completions/mean_terminated_length": 3071.83349609375, "completions/min_length": 683.0, "completions/min_terminated_length": 683.0, "epoch": 0.023744911804613297, "frac_reward_zero_std": 0.5, "grad_norm": 0.15312537550926208, "kl": 0.0, "learning_rate": 4.967218771566598e-07, "loss": -0.0006, "num_tokens": 2304747.0, "reward": 0.27500003576278687, "reward_std": 0.03872983902692795, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 70 }, { "completion_length": 2039.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4137.0, "completions/max_terminated_length": 4137.0, "completions/mean_length": 2039.166748046875, "completions/mean_terminated_length": 2039.166748046875, "completions/min_length": 865.0, "completions/min_terminated_length": 865.0, "epoch": 0.02408412483039349, "frac_reward_zero_std": 0.0, "grad_norm": 0.6391112208366394, "kl": 0.0, "learning_rate": 4.965493443754314e-07, "loss": 0.0104, "num_tokens": 2340371.0, "reward": 1.1375000476837158, "reward_std": 0.2848537564277649, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.28444522619247437, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 71 }, { "completion_length": 2516.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4060.0, "completions/max_terminated_length": 4060.0, "completions/mean_length": 2516.08349609375, "completions/mean_terminated_length": 2516.08349609375, "completions/min_length": 964.0, "completions/min_terminated_length": 964.0, "epoch": 0.024423337856173677, "frac_reward_zero_std": 0.0, "grad_norm": 0.47134825587272644, "kl": 0.0, "learning_rate": 4.963768115942029e-07, "loss": -0.0085, "num_tokens": 2383092.0, "reward": 0.8416666984558105, "reward_std": 0.20202915370464325, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.5033223032951355, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 72 }, { "completion_length": 2631.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5089.0, "completions/max_terminated_length": 5089.0, "completions/mean_length": 2631.83349609375, "completions/mean_terminated_length": 2631.83349609375, "completions/min_length": 1202.0, "completions/min_terminated_length": 1202.0, "epoch": 0.024762550881953865, "frac_reward_zero_std": 0.0, "grad_norm": 1.5083867311477661, "kl": 0.0, "learning_rate": 4.962042788129745e-07, "loss": 0.0137, "num_tokens": 2426014.0, "reward": 0.9833334684371948, "reward_std": 0.2473839521408081, "rewards/correctness_reward_func/mean": 0.6833333969116211, "rewards/correctness_reward_func/std": 0.32427075505256653, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 73 }, { "completion_length": 1854.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2987.0, "completions/max_terminated_length": 2987.0, "completions/mean_length": 1854.166748046875, "completions/mean_terminated_length": 1854.166748046875, "completions/min_length": 779.0, "completions/min_terminated_length": 779.0, "epoch": 0.025101763907734057, "frac_reward_zero_std": 0.5, "grad_norm": 0.07933083176612854, "kl": 0.0, "learning_rate": 4.96031746031746e-07, "loss": 0.001, "num_tokens": 2461752.0, "reward": 0.2875000238418579, "reward_std": 0.03061862289905548, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 74 }, { "completion_length": 1426.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3497.0, "completions/max_terminated_length": 3497.0, "completions/mean_length": 1426.666748046875, "completions/mean_terminated_length": 1426.666748046875, "completions/min_length": 627.0, "completions/min_terminated_length": 627.0, "epoch": 0.025440976933514246, "frac_reward_zero_std": 1.0, "grad_norm": 2.802689778036438e-07, "kl": 0.0, "learning_rate": 4.958592132505176e-07, "loss": 0.0, "num_tokens": 2487038.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 75 }, { "completion_length": 1519.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3167.0, "completions/max_terminated_length": 3167.0, "completions/mean_length": 1519.916748046875, "completions/mean_terminated_length": 1519.916748046875, "completions/min_length": 605.0, "completions/min_terminated_length": 605.0, "epoch": 0.025780189959294438, "frac_reward_zero_std": 0.5, "grad_norm": 0.08159344643354416, "kl": 0.0, "learning_rate": 4.956866804692891e-07, "loss": 0.0011, "num_tokens": 2513005.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 76 }, { "completion_length": 1891.0834350585938, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 3705.0, "completions/mean_length": 2440.166748046875, "completions/mean_terminated_length": 2063.0, "completions/min_length": 823.0, "completions/min_terminated_length": 823.0, "epoch": 0.026119402985074626, "frac_reward_zero_std": 0.5, "grad_norm": 0.5463436245918274, "kl": NaN, "learning_rate": 4.955141476880607e-07, "loss": -0.014, "num_tokens": 2549246.0, "reward": 0.8500000834465027, "reward_std": 0.2752271890640259, "rewards/correctness_reward_func/mean": 0.5999999642372131, "rewards/correctness_reward_func/std": 0.36181363463401794, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.09770084172487259, "step": 77 }, { "completion_length": 2267.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4360.0, "completions/max_terminated_length": 4360.0, "completions/mean_length": 2267.58349609375, "completions/mean_terminated_length": 2267.58349609375, "completions/min_length": 943.0, "completions/min_terminated_length": 943.0, "epoch": 0.026458616010854818, "frac_reward_zero_std": 0.0, "grad_norm": 0.3025602400302887, "kl": 0.0, "learning_rate": 4.953416149068323e-07, "loss": 0.0118, "num_tokens": 2588073.0, "reward": 1.1583333015441895, "reward_std": 0.10206204652786255, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 78 }, { "completion_length": 1166.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4041.0, "completions/max_terminated_length": 4041.0, "completions/mean_length": 1166.0, "completions/mean_terminated_length": 1166.0, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 0.026797829036635006, "frac_reward_zero_std": 0.5, "grad_norm": 0.4117281138896942, "kl": 0.0, "learning_rate": 4.951690821256038e-07, "loss": 0.0011, "num_tokens": 2612667.0, "reward": 1.0208334922790527, "reward_std": 0.19391795992851257, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 79 }, { "completion_length": 2583.83349609375, "completions/clipped_ratio": 0.0, "completions/max_length": 4004.0, "completions/max_terminated_length": 4004.0, "completions/mean_length": 2583.83349609375, "completions/mean_terminated_length": 2583.83349609375, "completions/min_length": 1107.0, "completions/min_terminated_length": 1107.0, "epoch": 0.027137042062415198, "frac_reward_zero_std": 0.0, "grad_norm": 0.5607545971870422, "kl": 0.0, "learning_rate": 4.949965493443754e-07, "loss": 0.0271, "num_tokens": 2658175.0, "reward": 1.0, "reward_std": 0.2581988573074341, "rewards/correctness_reward_func/mean": 0.7000000476837158, "rewards/correctness_reward_func/std": 0.3357488214969635, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 80 }, { "completion_length": 1255.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 3033.0, "completions/max_terminated_length": 3033.0, "completions/mean_length": 1255.3333740234375, "completions/mean_terminated_length": 1255.3333740234375, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 0.027476255088195387, "frac_reward_zero_std": 0.5, "grad_norm": 0.32516950368881226, "kl": 0.0, "learning_rate": 4.94824016563147e-07, "loss": -0.0018, "num_tokens": 2681411.0, "reward": 1.133333444595337, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 81 }, { "completion_length": 2111.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 5036.0, "completions/max_terminated_length": 5036.0, "completions/mean_length": 2111.25, "completions/mean_terminated_length": 2111.25, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 0.027815468113975575, "frac_reward_zero_std": 0.0, "grad_norm": 0.35042816400527954, "kl": 0.0, "learning_rate": 4.946514837819185e-07, "loss": 0.0032, "num_tokens": 2717276.0, "reward": 1.0916666984558105, "reward_std": 0.25535523891448975, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 82 }, { "completion_length": 2550.166748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5000.0, "completions/mean_length": 3099.25, "completions/mean_terminated_length": 2782.0, "completions/min_length": 1123.0, "completions/min_terminated_length": 1123.0, "epoch": 0.028154681139755767, "frac_reward_zero_std": 0.0, "grad_norm": 1.0620650053024292, "kl": NaN, "learning_rate": 4.944789510006901e-07, "loss": -0.0267, "num_tokens": 2759404.0, "reward": 0.7916666865348816, "reward_std": 0.4943132996559143, "rewards/correctness_reward_func/mean": 0.5166666507720947, "rewards/correctness_reward_func/std": 0.4628632962703705, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 83 }, { "completion_length": 2253.5001220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4483.0, "completions/mean_length": 2802.58349609375, "completions/mean_terminated_length": 2458.36376953125, "completions/min_length": 885.0, "completions/min_terminated_length": 885.0, "epoch": 0.028493894165535955, "frac_reward_zero_std": 0.5, "grad_norm": 0.1895497888326645, "kl": NaN, "learning_rate": 4.943064182194616e-07, "loss": -0.0107, "num_tokens": 2801404.0, "reward": 0.6750000715255737, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 84 }, { "completion_length": 2357.0833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4887.0, "completions/mean_length": 2906.166748046875, "completions/mean_terminated_length": 2571.36376953125, "completions/min_length": 1036.0, "completions/min_terminated_length": 1036.0, "epoch": 0.028833107191316147, "frac_reward_zero_std": 0.0, "grad_norm": 0.8943297863006592, "kl": NaN, "learning_rate": 4.941338854382333e-07, "loss": -0.0042, "num_tokens": 2841851.0, "reward": 0.9291667342185974, "reward_std": 0.4828321933746338, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 85 }, { "completion_length": 2335.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4581.0, "completions/max_terminated_length": 4581.0, "completions/mean_length": 2335.75, "completions/mean_terminated_length": 2335.75, "completions/min_length": 1136.0, "completions/min_terminated_length": 1136.0, "epoch": 0.029172320217096336, "frac_reward_zero_std": 0.5, "grad_norm": 0.6140693426132202, "kl": 0.0, "learning_rate": 4.939613526570047e-07, "loss": -0.0082, "num_tokens": 2883254.0, "reward": 1.1041667461395264, "reward_std": 0.2685222029685974, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.3857303261756897, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 86 }, { "completion_length": 1397.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3341.0, "completions/max_terminated_length": 3341.0, "completions/mean_length": 1397.666748046875, "completions/mean_terminated_length": 1397.666748046875, "completions/min_length": 530.0, "completions/min_terminated_length": 530.0, "epoch": 0.029511533242876527, "frac_reward_zero_std": 0.0, "grad_norm": 0.753872275352478, "kl": 0.0, "learning_rate": 4.937888198757764e-07, "loss": -0.032, "num_tokens": 2910766.0, "reward": 0.9625000357627869, "reward_std": 0.3697127103805542, "rewards/correctness_reward_func/mean": 0.699999988079071, "rewards/correctness_reward_func/std": 0.43064433336257935, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 87 }, { "completion_length": 1930.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3123.0, "completions/max_terminated_length": 3123.0, "completions/mean_length": 1930.666748046875, "completions/mean_terminated_length": 1930.666748046875, "completions/min_length": 634.0, "completions/min_terminated_length": 634.0, "epoch": 0.029850746268656716, "frac_reward_zero_std": 0.5, "grad_norm": 0.6493222713470459, "kl": 0.0, "learning_rate": 4.93616287094548e-07, "loss": -0.0047, "num_tokens": 2943504.0, "reward": 0.8291666507720947, "reward_std": 0.18534879386425018, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.5033223032951355, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 88 }, { "completion_length": 1324.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3149.0, "completions/max_terminated_length": 3149.0, "completions/mean_length": 1324.75, "completions/mean_terminated_length": 1324.75, "completions/min_length": 534.0, "completions/min_terminated_length": 534.0, "epoch": 0.030189959294436908, "frac_reward_zero_std": 0.5, "grad_norm": 0.42489466071128845, "kl": 0.0, "learning_rate": 4.934437543133195e-07, "loss": -0.0001, "num_tokens": 2974185.0, "reward": 0.6208333969116211, "reward_std": 0.19391795992851257, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 89 }, { "completion_length": 1459.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2928.0, "completions/max_terminated_length": 2928.0, "completions/mean_length": 1459.0833740234375, "completions/mean_terminated_length": 1459.0833740234375, "completions/min_length": 744.0, "completions/min_terminated_length": 744.0, "epoch": 0.030529172320217096, "frac_reward_zero_std": 0.5, "grad_norm": 0.0769336149096489, "kl": 0.0, "learning_rate": 4.932712215320911e-07, "loss": -0.0001, "num_tokens": 3005680.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 90 }, { "completion_length": 763.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1107.0, "completions/max_terminated_length": 1107.0, "completions/mean_length": 763.0833740234375, "completions/mean_terminated_length": 763.0833740234375, "completions/min_length": 507.0, "completions/min_terminated_length": 507.0, "epoch": 0.030868385345997285, "frac_reward_zero_std": 0.0, "grad_norm": 0.07657662034034729, "kl": 0.0, "learning_rate": 4.930986887508626e-07, "loss": -0.0011, "num_tokens": 3029669.0, "reward": 0.7041667699813843, "reward_std": 0.07144345343112946, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 91 }, { "completion_length": 777.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1282.0, "completions/max_terminated_length": 1282.0, "completions/mean_length": 777.0, "completions/mean_terminated_length": 777.0, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.031207598371777476, "frac_reward_zero_std": 1.0, "grad_norm": 8.553340791195296e-08, "kl": 0.0, "learning_rate": 4.929261559696342e-07, "loss": 0.0, "num_tokens": 3051119.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 92 }, { "completion_length": 2085.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5398.0, "completions/max_terminated_length": 5398.0, "completions/mean_length": 2085.75, "completions/mean_terminated_length": 2085.75, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "epoch": 0.03154681139755767, "frac_reward_zero_std": 0.5, "grad_norm": 0.419355571269989, "kl": 0.0, "learning_rate": 4.927536231884058e-07, "loss": 0.0014, "num_tokens": 3089666.0, "reward": 0.8958333730697632, "reward_std": 0.298921674489975, "rewards/correctness_reward_func/mean": 0.6333333253860474, "rewards/correctness_reward_func/std": 0.3892494738101959, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 93 }, { "completion_length": 2719.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4738.0, "completions/max_terminated_length": 4738.0, "completions/mean_length": 2719.33349609375, "completions/mean_terminated_length": 2719.33349609375, "completions/min_length": 763.0, "completions/min_terminated_length": 763.0, "epoch": 0.03188602442333786, "frac_reward_zero_std": 0.0, "grad_norm": 0.419344425201416, "kl": 0.0, "learning_rate": 4.925810904071773e-07, "loss": 0.0041, "num_tokens": 3131586.0, "reward": 0.6083334684371948, "reward_std": 0.1906316578388214, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 94 }, { "completion_length": 2132.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 5472.0, "completions/max_terminated_length": 5472.0, "completions/mean_length": 2132.25, "completions/mean_terminated_length": 2132.25, "completions/min_length": 753.0, "completions/min_terminated_length": 753.0, "epoch": 0.032225237449118045, "frac_reward_zero_std": 0.0, "grad_norm": 0.3389549255371094, "kl": 0.0, "learning_rate": 4.924085576259489e-07, "loss": 0.0064, "num_tokens": 3170469.0, "reward": 1.0875000953674316, "reward_std": 0.24555771052837372, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.2696799635887146, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 95 }, { "completion_length": 1292.5000305175781, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 3358.0, "completions/mean_length": 1841.5833740234375, "completions/mean_terminated_length": 1410.0, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 0.032564450474898234, "frac_reward_zero_std": 0.0, "grad_norm": 0.10573876649141312, "kl": NaN, "learning_rate": 4.922360248447205e-07, "loss": -0.0043, "num_tokens": 3200067.0, "reward": 0.6375000476837158, "reward_std": 0.06934845447540283, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.23750001192092896, "rewards/format_reward_func/std": 0.07723929733037949, "step": 96 }, { "completion_length": 1452.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2044.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1452.0, "completions/mean_terminated_length": 1452.0, "completions/min_length": 820.0, "completions/min_terminated_length": 820.0, "epoch": 0.03290366350067843, "frac_reward_zero_std": 0.5, "grad_norm": 0.44858041405677795, "kl": 0.0, "learning_rate": 4.92063492063492e-07, "loss": 0.0015, "num_tokens": 3228213.0, "reward": 0.833333432674408, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 97 }, { "completion_length": 2012.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 6447.0, "completions/max_terminated_length": 6447.0, "completions/mean_length": 2012.0, "completions/mean_terminated_length": 2012.0, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.03324287652645862, "frac_reward_zero_std": 0.0, "grad_norm": 0.17230212688446045, "kl": 0.0, "learning_rate": 4.918909592822636e-07, "loss": 0.0018, "num_tokens": 3265227.0, "reward": 1.120833396911621, "reward_std": 0.10867881774902344, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.23750001192092896, "rewards/format_reward_func/std": 0.07723929733037949, "step": 98 }, { "completion_length": 2578.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4870.0, "completions/max_terminated_length": 4870.0, "completions/mean_length": 2578.166748046875, "completions/mean_terminated_length": 2578.166748046875, "completions/min_length": 967.0, "completions/min_terminated_length": 967.0, "epoch": 0.033582089552238806, "frac_reward_zero_std": 0.5, "grad_norm": 0.5569886565208435, "kl": 0.0, "learning_rate": 4.917184265010351e-07, "loss": -0.0017, "num_tokens": 3313511.0, "reward": 0.9541667699813843, "reward_std": 0.22716552019119263, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 99 }, { "completion_length": 715.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1045.0, "completions/max_terminated_length": 1045.0, "completions/mean_length": 715.3333740234375, "completions/mean_terminated_length": 715.3333740234375, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 0.033921302578018994, "frac_reward_zero_std": 0.5, "grad_norm": 0.06762003153562546, "kl": 0.0, "learning_rate": 4.915458937198068e-07, "loss": -0.0004, "num_tokens": 3333903.0, "reward": 1.1625001430511475, "reward_std": 0.04107918590307236, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 100 }, { "completion_length": 2714.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6070.0, "completions/max_terminated_length": 6070.0, "completions/mean_length": 2714.916748046875, "completions/mean_terminated_length": 2714.916748046875, "completions/min_length": 710.0, "completions/min_terminated_length": 710.0, "epoch": 0.03426051560379918, "frac_reward_zero_std": 0.0, "grad_norm": 0.32408711314201355, "kl": 0.0, "learning_rate": 4.913733609385783e-07, "loss": -0.0026, "num_tokens": 3380318.0, "reward": 0.7000000476837158, "reward_std": 0.09350207448005676, "rewards/correctness_reward_func/mean": 0.45000001788139343, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.07385490089654922, "step": 101 }, { "completion_length": 669.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1200.0, "completions/max_terminated_length": 1200.0, "completions/mean_length": 669.0, "completions/mean_terminated_length": 669.0, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 0.03459972862957938, "frac_reward_zero_std": 0.5, "grad_norm": 0.05602804571390152, "kl": 0.0, "learning_rate": 4.912008281573499e-07, "loss": -0.0003, "num_tokens": 3398468.0, "reward": 1.0750000476837158, "reward_std": 0.038729824125766754, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 102 }, { "completion_length": 2353.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6165.0, "completions/max_terminated_length": 6165.0, "completions/mean_length": 2353.166748046875, "completions/mean_terminated_length": 2353.166748046875, "completions/min_length": 870.0, "completions/min_terminated_length": 870.0, "epoch": 0.034938941655359566, "frac_reward_zero_std": 0.5, "grad_norm": 0.4930708408355713, "kl": 0.0, "learning_rate": 4.910282953761215e-07, "loss": 0.043, "num_tokens": 3432304.0, "reward": 1.133333444595337, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.2806917726993561, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 103 }, { "completion_length": 2527.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4175.0, "completions/max_terminated_length": 4175.0, "completions/mean_length": 2527.5, "completions/mean_terminated_length": 2527.5, "completions/min_length": 1459.0, "completions/min_terminated_length": 1459.0, "epoch": 0.035278154681139755, "frac_reward_zero_std": 0.5, "grad_norm": 0.2505856454372406, "kl": 0.0, "learning_rate": 4.90855762594893e-07, "loss": 0.0011, "num_tokens": 3473278.0, "reward": 1.1666667461395264, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 104 }, { "completion_length": 1665.3333740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5170.0, "completions/mean_length": 2214.416748046875, "completions/mean_terminated_length": 1816.727294921875, "completions/min_length": 827.0, "completions/min_terminated_length": 827.0, "epoch": 0.03561736770691994, "frac_reward_zero_std": 0.0, "grad_norm": 0.7407315969467163, "kl": NaN, "learning_rate": 4.906832298136646e-07, "loss": -0.0215, "num_tokens": 3503390.0, "reward": 0.783333420753479, "reward_std": 0.4725285470485687, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.09770084172487259, "step": 105 }, { "completion_length": 3103.2501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4550.0, "completions/max_terminated_length": 4550.0, "completions/mean_length": 3103.25, "completions/mean_terminated_length": 3103.25, "completions/min_length": 1426.0, "completions/min_terminated_length": 1426.0, "epoch": 0.03595658073270014, "frac_reward_zero_std": 0.5, "grad_norm": 0.6687279343605042, "kl": 0.0, "learning_rate": 4.905106970324361e-07, "loss": 0.0069, "num_tokens": 3552125.0, "reward": 0.8375000953674316, "reward_std": 0.2458404153585434, "rewards/correctness_reward_func/mean": 0.550000011920929, "rewards/correctness_reward_func/std": 0.4100997745990753, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 106 }, { "completion_length": 1616.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3833.0, "completions/max_terminated_length": 3833.0, "completions/mean_length": 1616.416748046875, "completions/mean_terminated_length": 1616.416748046875, "completions/min_length": 682.0, "completions/min_terminated_length": 682.0, "epoch": 0.03629579375848033, "frac_reward_zero_std": 0.0, "grad_norm": 0.5577266812324524, "kl": 0.0, "learning_rate": 4.903381642512077e-07, "loss": 0.0133, "num_tokens": 3581374.0, "reward": 1.0333333015441895, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.35505014657974243, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 107 }, { "completion_length": 1808.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3992.0, "completions/max_terminated_length": 3992.0, "completions/mean_length": 1808.25, "completions/mean_terminated_length": 1808.25, "completions/min_length": 550.0, "completions/min_terminated_length": 550.0, "epoch": 0.036635006784260515, "frac_reward_zero_std": 0.5, "grad_norm": 0.0762360617518425, "kl": 0.0, "learning_rate": 4.901656314699793e-07, "loss": 0.0011, "num_tokens": 3614665.0, "reward": 1.254166603088379, "reward_std": 0.05103101581335068, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 108 }, { "completion_length": 1865.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3235.0, "completions/max_terminated_length": 3235.0, "completions/mean_length": 1865.8333740234375, "completions/mean_terminated_length": 1865.8333740234375, "completions/min_length": 726.0, "completions/min_terminated_length": 726.0, "epoch": 0.036974219810040704, "frac_reward_zero_std": 0.5, "grad_norm": 0.45066243410110474, "kl": 0.0, "learning_rate": 4.899930986887508e-07, "loss": 0.0067, "num_tokens": 3649031.0, "reward": 1.0333335399627686, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 109 }, { "completion_length": 1545.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3815.0, "completions/max_terminated_length": 3815.0, "completions/mean_length": 1545.3333740234375, "completions/mean_terminated_length": 1545.3333740234375, "completions/min_length": 562.0, "completions/min_terminated_length": 562.0, "epoch": 0.03731343283582089, "frac_reward_zero_std": 0.0, "grad_norm": 0.9093514680862427, "kl": 0.0, "learning_rate": 4.898205659075224e-07, "loss": 0.0008, "num_tokens": 3681831.0, "reward": 1.0250000953674316, "reward_std": 0.3382870554924011, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.36306774616241455, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 110 }, { "completion_length": 2410.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5848.0, "completions/max_terminated_length": 5848.0, "completions/mean_length": 2410.916748046875, "completions/mean_terminated_length": 2410.916748046875, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 0.03765264586160109, "frac_reward_zero_std": 0.0, "grad_norm": 0.689710795879364, "kl": 0.0, "learning_rate": 4.89648033126294e-07, "loss": 0.0, "num_tokens": 3725348.0, "reward": 0.9833333492279053, "reward_std": 0.3129711151123047, "rewards/correctness_reward_func/mean": 0.6833333373069763, "rewards/correctness_reward_func/std": 0.4217568039894104, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 111 }, { "completion_length": 1068.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2515.0, "completions/max_terminated_length": 2515.0, "completions/mean_length": 1068.666748046875, "completions/mean_terminated_length": 1068.666748046875, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.037991858887381276, "frac_reward_zero_std": 0.5, "grad_norm": 0.5011056661605835, "kl": 0.0, "learning_rate": 4.894755003450655e-07, "loss": 0.0033, "num_tokens": 3749494.0, "reward": 0.9541667699813843, "reward_std": 0.22716552019119263, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 112 }, { "completion_length": 1666.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4317.0, "completions/max_terminated_length": 4317.0, "completions/mean_length": 1666.5833740234375, "completions/mean_terminated_length": 1666.5833740234375, "completions/min_length": 685.0, "completions/min_terminated_length": 685.0, "epoch": 0.038331071913161464, "frac_reward_zero_std": 0.5, "grad_norm": 0.4090554416179657, "kl": 0.0, "learning_rate": 4.893029675638371e-07, "loss": 0.0057, "num_tokens": 3779723.0, "reward": 0.6333333253860474, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 113 }, { "completion_length": 2164.5000610351562, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5738.0, "completions/mean_length": 4360.83349609375, "completions/mean_terminated_length": 3246.75, "completions/min_length": 1838.0, "completions/min_terminated_length": 1838.0, "epoch": 0.03867028493894165, "frac_reward_zero_std": 0.0, "grad_norm": 0.8448308706283569, "kl": NaN, "learning_rate": 4.891304347826087e-07, "loss": -0.0637, "num_tokens": 3821321.0, "reward": 0.3499999940395355, "reward_std": 0.4247293472290039, "rewards/correctness_reward_func/mean": 0.14999999105930328, "rewards/correctness_reward_func/std": 0.35290998220443726, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.14770980179309845, "step": 114 }, { "completion_length": 1456.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2937.0, "completions/max_terminated_length": 2937.0, "completions/mean_length": 1456.25, "completions/mean_terminated_length": 1456.25, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "epoch": 0.03900949796472185, "frac_reward_zero_std": 0.5, "grad_norm": 0.050372909754514694, "kl": 0.0, "learning_rate": 4.889579020013803e-07, "loss": 0.0004, "num_tokens": 3846560.0, "reward": 0.75, "reward_std": 0.03872983902692795, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.07385490089654922, "step": 115 }, { "completion_length": 3040.0001220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4961.0, "completions/mean_length": 3589.08349609375, "completions/mean_terminated_length": 3316.36376953125, "completions/min_length": 1877.0, "completions/min_terminated_length": 1877.0, "epoch": 0.03934871099050204, "frac_reward_zero_std": 0.0, "grad_norm": 0.7781913876533508, "kl": NaN, "learning_rate": 4.887853692201518e-07, "loss": -0.038, "num_tokens": 3899264.0, "reward": 1.0250000953674316, "reward_std": 0.26536139845848083, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.24308621883392334, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 116 }, { "completion_length": 3644.2501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 6581.0, "completions/max_terminated_length": 6581.0, "completions/mean_length": 3644.25, "completions/mean_terminated_length": 3644.25, "completions/min_length": 1230.0, "completions/min_terminated_length": 1230.0, "epoch": 0.039687924016282225, "frac_reward_zero_std": 0.5, "grad_norm": 0.624457597732544, "kl": 0.0, "learning_rate": 4.886128364389234e-07, "loss": 0.0081, "num_tokens": 3961217.0, "reward": 1.1375000476837158, "reward_std": 0.20600365102291107, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.28444525599479675, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 117 }, { "completion_length": 2229.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 3979.0, "completions/max_terminated_length": 3979.0, "completions/mean_length": 2229.5, "completions/mean_terminated_length": 2229.5, "completions/min_length": 1003.0, "completions/min_terminated_length": 1003.0, "epoch": 0.04002713704206241, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.88440303657695e-07, "loss": 0.0, "num_tokens": 4004513.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 118 }, { "completion_length": 2228.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3815.0, "completions/max_terminated_length": 3815.0, "completions/mean_length": 2228.416748046875, "completions/mean_terminated_length": 2228.416748046875, "completions/min_length": 864.0, "completions/min_terminated_length": 864.0, "epoch": 0.0403663500678426, "frac_reward_zero_std": 0.0, "grad_norm": 0.7541438937187195, "kl": 0.0, "learning_rate": 4.882677708764665e-07, "loss": -0.0092, "num_tokens": 4043008.0, "reward": 0.7000000476837158, "reward_std": 0.41311824321746826, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 119 }, { "completion_length": 2333.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5397.0, "completions/max_terminated_length": 5397.0, "completions/mean_length": 2333.83349609375, "completions/mean_terminated_length": 2333.83349609375, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 0.0407055630936228, "frac_reward_zero_std": 0.0, "grad_norm": 0.7183325290679932, "kl": 0.0, "learning_rate": 4.880952380952381e-07, "loss": 0.0125, "num_tokens": 4085732.0, "reward": 0.970833420753479, "reward_std": 0.3699861466884613, "rewards/correctness_reward_func/mean": 0.6833333969116211, "rewards/correctness_reward_func/std": 0.32427075505256653, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 120 }, { "completion_length": 2344.83349609375, "completions/clipped_ratio": 0.0, "completions/max_length": 5423.0, "completions/max_terminated_length": 5423.0, "completions/mean_length": 2344.83349609375, "completions/mean_terminated_length": 2344.83349609375, "completions/min_length": 556.0, "completions/min_terminated_length": 556.0, "epoch": 0.041044776119402986, "frac_reward_zero_std": 0.0, "grad_norm": 0.18063339591026306, "kl": 0.0, "learning_rate": 4.879227053140096e-07, "loss": 0.0024, "num_tokens": 4126794.0, "reward": 1.2291667461395264, "reward_std": 0.0927189290523529, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 121 }, { "completion_length": 2123.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3868.0, "completions/max_terminated_length": 3868.0, "completions/mean_length": 2123.166748046875, "completions/mean_terminated_length": 2123.166748046875, "completions/min_length": 1150.0, "completions/min_terminated_length": 1150.0, "epoch": 0.041383989145183174, "frac_reward_zero_std": 0.5, "grad_norm": 0.7253784537315369, "kl": 0.0, "learning_rate": 4.877501725327812e-07, "loss": 0.0166, "num_tokens": 4164896.0, "reward": 1.120833396911621, "reward_std": 0.19391795992851257, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.2806917726993561, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 122 }, { "completion_length": 1970.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4138.0, "completions/max_terminated_length": 4138.0, "completions/mean_length": 1970.416748046875, "completions/mean_terminated_length": 1970.416748046875, "completions/min_length": 1346.0, "completions/min_terminated_length": 1346.0, "epoch": 0.04172320217096336, "frac_reward_zero_std": 0.5, "grad_norm": 0.3625893294811249, "kl": 0.0, "learning_rate": 4.875776397515527e-07, "loss": 0.005, "num_tokens": 4200325.0, "reward": 1.0208334922790527, "reward_std": 0.19391795992851257, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 123 }, { "completion_length": 2338.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6154.0, "completions/max_terminated_length": 6154.0, "completions/mean_length": 2338.666748046875, "completions/mean_terminated_length": 2338.666748046875, "completions/min_length": 917.0, "completions/min_terminated_length": 917.0, "epoch": 0.04206241519674356, "frac_reward_zero_std": 0.5, "grad_norm": 0.6652452349662781, "kl": 0.0, "learning_rate": 4.874051069703243e-07, "loss": 0.0208, "num_tokens": 4235175.0, "reward": 0.46250003576278687, "reward_std": 0.26016825437545776, "rewards/correctness_reward_func/mean": 0.20000000298023224, "rewards/correctness_reward_func/std": 0.36181363463401794, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 124 }, { "completion_length": 1641.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3666.0, "completions/max_terminated_length": 3666.0, "completions/mean_length": 1641.75, "completions/mean_terminated_length": 1641.75, "completions/min_length": 552.0, "completions/min_terminated_length": 552.0, "epoch": 0.042401628222523746, "frac_reward_zero_std": 0.0, "grad_norm": 0.5879623889923096, "kl": 0.0, "learning_rate": 4.872325741890959e-07, "loss": 0.0249, "num_tokens": 4266114.0, "reward": 1.1041667461395264, "reward_std": 0.23474274575710297, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 125 }, { "completion_length": 2013.8333740234375, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6537.0, "completions/mean_length": 4210.1669921875, "completions/mean_terminated_length": 3020.75, "completions/min_length": 1015.0, "completions/min_terminated_length": 1015.0, "epoch": 0.042740841248303935, "frac_reward_zero_std": 0.5, "grad_norm": 0.6589382290840149, "kl": NaN, "learning_rate": 4.870600414078675e-07, "loss": -0.093, "num_tokens": 4304308.0, "reward": 0.7333334684371948, "reward_std": 0.2840188145637512, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.14770980179309845, "step": 126 }, { "completion_length": 1316.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2623.0, "completions/max_terminated_length": 2623.0, "completions/mean_length": 1316.916748046875, "completions/mean_terminated_length": 1316.916748046875, "completions/min_length": 551.0, "completions/min_terminated_length": 551.0, "epoch": 0.04308005427408412, "frac_reward_zero_std": 0.0, "grad_norm": 0.490523099899292, "kl": 0.0, "learning_rate": 4.86887508626639e-07, "loss": -0.0003, "num_tokens": 4331583.0, "reward": 1.1708333492279053, "reward_std": 0.2863824963569641, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 127 }, { "completion_length": 956.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1425.0, "completions/max_terminated_length": 1425.0, "completions/mean_length": 956.1666870117188, "completions/mean_terminated_length": 956.1666870117188, "completions/min_length": 601.0, "completions/min_terminated_length": 601.0, "epoch": 0.04341926729986431, "frac_reward_zero_std": 0.5, "grad_norm": 0.34587809443473816, "kl": 0.0, "learning_rate": 4.867149758454106e-07, "loss": -0.0019, "num_tokens": 4355141.0, "reward": 0.8625000715255737, "reward_std": 0.26016825437545776, "rewards/correctness_reward_func/mean": 0.6000000238418579, "rewards/correctness_reward_func/std": 0.36181363463401794, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 128 }, { "completion_length": 831.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1665.0, "completions/max_terminated_length": 1665.0, "completions/mean_length": 831.0833740234375, "completions/mean_terminated_length": 831.0833740234375, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.04375848032564451, "frac_reward_zero_std": 0.5, "grad_norm": 0.08698862046003342, "kl": 0.0, "learning_rate": 4.865424430641822e-07, "loss": -0.0014, "num_tokens": 4378542.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 129 }, { "completion_length": 2297.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 5522.0, "completions/max_terminated_length": 5522.0, "completions/mean_length": 2297.75, "completions/mean_terminated_length": 2297.75, "completions/min_length": 1058.0, "completions/min_terminated_length": 1058.0, "epoch": 0.044097693351424695, "frac_reward_zero_std": 0.5, "grad_norm": 0.546366810798645, "kl": 0.0, "learning_rate": 4.863699102829538e-07, "loss": -0.0185, "num_tokens": 4411467.0, "reward": 1.1041667461395264, "reward_std": 0.23474276065826416, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 130 }, { "completion_length": 2667.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4986.0, "completions/max_terminated_length": 4986.0, "completions/mean_length": 2667.33349609375, "completions/mean_terminated_length": 2667.33349609375, "completions/min_length": 1077.0, "completions/min_terminated_length": 1077.0, "epoch": 0.044436906377204884, "frac_reward_zero_std": 0.5, "grad_norm": 0.11569435894489288, "kl": 0.0, "learning_rate": 4.861973775017253e-07, "loss": -0.0001, "num_tokens": 4458631.0, "reward": 1.1500000953674316, "reward_std": 0.05477222427725792, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.09045340120792389, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 131 }, { "completion_length": 1830.0000610351562, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 3971.0, "completions/mean_length": 2379.08349609375, "completions/mean_terminated_length": 1996.3636474609375, "completions/min_length": 839.0, "completions/min_terminated_length": 839.0, "epoch": 0.04477611940298507, "frac_reward_zero_std": 0.0, "grad_norm": 0.2605733573436737, "kl": NaN, "learning_rate": 4.860248447204969e-07, "loss": -0.0351, "num_tokens": 4493173.0, "reward": 1.0250000953674316, "reward_std": 0.26536139845848083, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.24308621883392334, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 132 }, { "completion_length": 2761.5001220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5889.0, "completions/mean_length": 3310.58349609375, "completions/mean_terminated_length": 3012.545654296875, "completions/min_length": 1545.0, "completions/min_terminated_length": 1545.0, "epoch": 0.04511533242876527, "frac_reward_zero_std": 0.0, "grad_norm": 0.6202500462532043, "kl": NaN, "learning_rate": 4.858523119392685e-07, "loss": -0.0471, "num_tokens": 4536811.0, "reward": 0.6458333730697632, "reward_std": 0.28442299365997314, "rewards/correctness_reward_func/mean": 0.38333332538604736, "rewards/correctness_reward_func/std": 0.4783177673816681, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 133 }, { "completion_length": 1869.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3734.0, "completions/max_terminated_length": 3734.0, "completions/mean_length": 1869.666748046875, "completions/mean_terminated_length": 1869.666748046875, "completions/min_length": 1113.0, "completions/min_terminated_length": 1113.0, "epoch": 0.045454545454545456, "frac_reward_zero_std": 1.0, "grad_norm": 1.1787030729237813e-07, "kl": 0.0, "learning_rate": 4.8567977915804e-07, "loss": 0.0, "num_tokens": 4570359.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 134 }, { "completion_length": 1182.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2124.0, "completions/max_terminated_length": 2124.0, "completions/mean_length": 1182.666748046875, "completions/mean_terminated_length": 1182.666748046875, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "epoch": 0.045793758480325644, "frac_reward_zero_std": 0.5, "grad_norm": 0.0750177651643753, "kl": 0.0, "learning_rate": 4.855072463768116e-07, "loss": 0.001, "num_tokens": 4593809.0, "reward": 1.133333444595337, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 135 }, { "completion_length": 1200.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 5137.0, "completions/max_terminated_length": 5137.0, "completions/mean_length": 1200.5, "completions/mean_terminated_length": 1200.5, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.04613297150610583, "frac_reward_zero_std": 0.5, "grad_norm": 0.5664093494415283, "kl": 0.0, "learning_rate": 4.853347135955831e-07, "loss": 0.0321, "num_tokens": 4620119.0, "reward": 1.1666667461395264, "reward_std": 0.18618986010551453, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.287096232175827, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 136 }, { "completion_length": 2200.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3977.0, "completions/max_terminated_length": 3977.0, "completions/mean_length": 2200.08349609375, "completions/mean_terminated_length": 2200.08349609375, "completions/min_length": 997.0, "completions/min_terminated_length": 997.0, "epoch": 0.04647218453188602, "frac_reward_zero_std": 0.5, "grad_norm": 0.42219477891921997, "kl": 0.0, "learning_rate": 4.851621808143547e-07, "loss": -0.0225, "num_tokens": 4662480.0, "reward": 1.0375001430511475, "reward_std": 0.20600365102291107, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.24308621883392334, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 137 }, { "completion_length": 1934.75, "completions/clipped_ratio": 0.41666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5036.0, "completions/mean_length": 4680.1669921875, "completions/mean_terminated_length": 3316.71435546875, "completions/min_length": 2530.0, "completions/min_terminated_length": 2530.0, "epoch": 0.046811397557666216, "frac_reward_zero_std": 0.5, "grad_norm": 0.17448575794696808, "kl": NaN, "learning_rate": 4.849896480331262e-07, "loss": -0.0143, "num_tokens": 4698939.0, "reward": 0.5875000953674316, "reward_std": 0.06274950504302979, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.14479610323905945, "step": 138 }, { "completion_length": 1352.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2818.0, "completions/max_terminated_length": 2818.0, "completions/mean_length": 1352.3333740234375, "completions/mean_terminated_length": 1352.3333740234375, "completions/min_length": 668.0, "completions/min_terminated_length": 668.0, "epoch": 0.047150610583446405, "frac_reward_zero_std": 0.5, "grad_norm": 0.29719895124435425, "kl": 0.0, "learning_rate": 4.848171152518978e-07, "loss": -0.0035, "num_tokens": 4724791.0, "reward": 1.1375000476837158, "reward_std": 0.20600365102291107, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.28444522619247437, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 139 }, { "completion_length": 2913.33349609375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4754.0, "completions/mean_length": 3462.416748046875, "completions/mean_terminated_length": 3178.181884765625, "completions/min_length": 736.0, "completions/min_terminated_length": 736.0, "epoch": 0.04748982360922659, "frac_reward_zero_std": 0.5, "grad_norm": 0.5751773118972778, "kl": NaN, "learning_rate": 4.846445824706694e-07, "loss": -0.017, "num_tokens": 4769939.0, "reward": 0.4208333492279053, "reward_std": 0.21818380057811737, "rewards/correctness_reward_func/mean": 0.13333334028720856, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 140 }, { "completion_length": 1474.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2125.0, "completions/max_terminated_length": 2125.0, "completions/mean_length": 1474.916748046875, "completions/mean_terminated_length": 1474.916748046875, "completions/min_length": 960.0, "completions/min_terminated_length": 960.0, "epoch": 0.04782903663500678, "frac_reward_zero_std": 0.5, "grad_norm": 0.07152793556451797, "kl": 0.0, "learning_rate": 4.84472049689441e-07, "loss": 0.0009, "num_tokens": 4803178.0, "reward": 1.1875, "reward_std": 0.03061862848699093, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 141 }, { "completion_length": 1793.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2506.0, "completions/max_terminated_length": 2506.0, "completions/mean_length": 1793.916748046875, "completions/mean_terminated_length": 1793.916748046875, "completions/min_length": 1249.0, "completions/min_terminated_length": 1249.0, "epoch": 0.04816824966078698, "frac_reward_zero_std": 0.0, "grad_norm": 0.1186007633805275, "kl": 0.0, "learning_rate": 4.842995169082126e-07, "loss": -0.0018, "num_tokens": 4837575.0, "reward": 1.1041667461395264, "reward_std": 0.07144343852996826, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 142 }, { "completion_length": 1622.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4223.0, "completions/max_terminated_length": 4223.0, "completions/mean_length": 1622.25, "completions/mean_terminated_length": 1622.25, "completions/min_length": 823.0, "completions/min_terminated_length": 823.0, "epoch": 0.048507462686567165, "frac_reward_zero_std": 0.5, "grad_norm": 0.09311472624540329, "kl": 0.0, "learning_rate": 4.841269841269841e-07, "loss": -0.001, "num_tokens": 4868670.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 143 }, { "completion_length": 1624.75, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 4910.0, "completions/mean_length": 2722.916748046875, "completions/mean_terminated_length": 1949.7000732421875, "completions/min_length": 699.0, "completions/min_terminated_length": 699.0, "epoch": 0.048846675712347354, "frac_reward_zero_std": 0.0, "grad_norm": 0.25399917364120483, "kl": NaN, "learning_rate": 4.839544513457557e-07, "loss": -0.0161, "num_tokens": 4895865.0, "reward": 0.6666667461395264, "reward_std": 0.11828449368476868, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 144 }, { "completion_length": 1415.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3606.0, "completions/max_terminated_length": 3606.0, "completions/mean_length": 1415.75, "completions/mean_terminated_length": 1415.75, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 0.04918588873812754, "frac_reward_zero_std": 0.5, "grad_norm": 0.07228909432888031, "kl": 0.0, "learning_rate": 4.837819185645272e-07, "loss": -0.0003, "num_tokens": 4924656.0, "reward": 1.120833396911621, "reward_std": 0.04005204886198044, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 145 }, { "completion_length": 549.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 549.4166870117188, "completions/mean_terminated_length": 549.4166870117188, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 0.04952510176390773, "frac_reward_zero_std": 0.0, "grad_norm": 0.055849168449640274, "kl": 0.0, "learning_rate": 4.836093857832988e-07, "loss": 0.0002, "num_tokens": 4948643.0, "reward": 1.1541666984558105, "reward_std": 0.06024051457643509, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.23750001192092896, "rewards/format_reward_func/std": 0.07723929733037949, "step": 146 }, { "completion_length": 1790.916748046875, "completions/clipped_ratio": 0.41666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6085.0, "completions/mean_length": 4536.33349609375, "completions/mean_terminated_length": 3070.14306640625, "completions/min_length": 686.0, "completions/min_terminated_length": 686.0, "epoch": 0.049864314789687926, "frac_reward_zero_std": 0.5, "grad_norm": 1.2127513885498047, "kl": NaN, "learning_rate": 4.834368530020704e-07, "loss": -0.0705, "num_tokens": 4982428.0, "reward": 0.6416667699813843, "reward_std": 0.22453658282756805, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.17499999701976776, "rewards/format_reward_func/std": 0.15447859466075897, "step": 147 }, { "completion_length": 2748.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3929.0, "completions/max_terminated_length": 3929.0, "completions/mean_length": 2748.416748046875, "completions/mean_terminated_length": 2748.416748046875, "completions/min_length": 971.0, "completions/min_terminated_length": 971.0, "epoch": 0.050203527815468114, "frac_reward_zero_std": 0.5, "grad_norm": 0.5216585397720337, "kl": 0.0, "learning_rate": 4.83264320220842e-07, "loss": -0.0151, "num_tokens": 5029227.0, "reward": 0.833333432674408, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 148 }, { "completion_length": 1443.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2598.0, "completions/max_terminated_length": 2598.0, "completions/mean_length": 1443.416748046875, "completions/mean_terminated_length": 1443.416748046875, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "epoch": 0.0505427408412483, "frac_reward_zero_std": 0.0, "grad_norm": 0.31681346893310547, "kl": 0.0, "learning_rate": 4.830917874396135e-07, "loss": -0.0102, "num_tokens": 5056202.0, "reward": 0.6458333730697632, "reward_std": 0.2625694274902344, "rewards/correctness_reward_func/mean": 0.38333332538604736, "rewards/correctness_reward_func/std": 0.4783177673816681, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 149 }, { "completion_length": 2423.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 5794.0, "completions/max_terminated_length": 5794.0, "completions/mean_length": 2423.166748046875, "completions/mean_terminated_length": 2423.166748046875, "completions/min_length": 541.0, "completions/min_terminated_length": 541.0, "epoch": 0.05088195386702849, "frac_reward_zero_std": 0.5, "grad_norm": 0.12223686277866364, "kl": 0.0, "learning_rate": 4.829192546583851e-07, "loss": -0.0021, "num_tokens": 5094136.0, "reward": 1.2333333492279053, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 150 }, { "completion_length": 2325.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 5329.0, "completions/max_terminated_length": 5329.0, "completions/mean_length": 2325.666748046875, "completions/mean_terminated_length": 2325.666748046875, "completions/min_length": 751.0, "completions/min_terminated_length": 751.0, "epoch": 0.05122116689280869, "frac_reward_zero_std": 0.0, "grad_norm": 0.9536461234092712, "kl": 0.0, "learning_rate": 4.827467218771566e-07, "loss": -0.0068, "num_tokens": 5137452.0, "reward": 0.7416666746139526, "reward_std": 0.4643779993057251, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 151 }, { "completion_length": 885.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1787.0, "completions/max_terminated_length": 1787.0, "completions/mean_length": 885.8333740234375, "completions/mean_terminated_length": 885.8333740234375, "completions/min_length": 605.0, "completions/min_terminated_length": 605.0, "epoch": 0.051560379918588875, "frac_reward_zero_std": 0.0, "grad_norm": 0.8494682908058167, "kl": 0.0, "learning_rate": 4.825741890959282e-07, "loss": 0.0056, "num_tokens": 5155720.0, "reward": 0.9250000715255737, "reward_std": 0.2563120126724243, "rewards/correctness_reward_func/mean": 0.699999988079071, "rewards/correctness_reward_func/std": 0.3357488214969635, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.07833494991064072, "step": 152 }, { "completion_length": 1869.3334350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 4111.0, "completions/max_terminated_length": 4111.0, "completions/mean_length": 1869.3333740234375, "completions/mean_terminated_length": 1869.3333740234375, "completions/min_length": 547.0, "completions/min_terminated_length": 547.0, "epoch": 0.05189959294436906, "frac_reward_zero_std": 0.0, "grad_norm": 0.1964230239391327, "kl": 0.0, "learning_rate": 4.824016563146997e-07, "loss": 0.0004, "num_tokens": 5188076.0, "reward": 1.2000000476837158, "reward_std": 0.08164963126182556, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 153 }, { "completion_length": 1415.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2859.0, "completions/max_terminated_length": 2859.0, "completions/mean_length": 1415.666748046875, "completions/mean_terminated_length": 1415.666748046875, "completions/min_length": 632.0, "completions/min_terminated_length": 632.0, "epoch": 0.05223880597014925, "frac_reward_zero_std": 0.0, "grad_norm": 0.35508859157562256, "kl": 0.0, "learning_rate": 4.822291235334713e-07, "loss": 0.0054, "num_tokens": 5217724.0, "reward": 0.7416666746139526, "reward_std": 0.20202915370464325, "rewards/correctness_reward_func/mean": 0.46666669845581055, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 154 }, { "completion_length": 2550.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5538.0, "completions/max_terminated_length": 5538.0, "completions/mean_length": 2550.166748046875, "completions/mean_terminated_length": 2550.166748046875, "completions/min_length": 971.0, "completions/min_terminated_length": 971.0, "epoch": 0.05257801899592944, "frac_reward_zero_std": 0.5, "grad_norm": 0.22098730504512787, "kl": 0.0, "learning_rate": 4.82056590752243e-07, "loss": 0.0016, "num_tokens": 5257764.0, "reward": 1.120833396911621, "reward_std": 0.06785397976636887, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 155 }, { "completion_length": 693.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1217.0, "completions/max_terminated_length": 1217.0, "completions/mean_length": 693.25, "completions/mean_terminated_length": 693.25, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.052917232021709636, "frac_reward_zero_std": 0.5, "grad_norm": 0.05356336012482643, "kl": 0.0, "learning_rate": 4.818840579710144e-07, "loss": 0.0001, "num_tokens": 5279343.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 156 }, { "completion_length": 840.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1473.0, "completions/max_terminated_length": 1473.0, "completions/mean_length": 840.4166870117188, "completions/mean_terminated_length": 840.4166870117188, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "epoch": 0.053256445047489824, "frac_reward_zero_std": 0.5, "grad_norm": 0.38578730821609497, "kl": 0.0, "learning_rate": 4.817115251897861e-07, "loss": -0.0039, "num_tokens": 5303306.0, "reward": 0.5416666269302368, "reward_std": 0.24528895318508148, "rewards/correctness_reward_func/mean": 0.2666666805744171, "rewards/correctness_reward_func/std": 0.393892765045166, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 157 }, { "completion_length": 2539.666748046875, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6584.0, "completions/mean_length": 3637.83349609375, "completions/mean_terminated_length": 3047.60009765625, "completions/min_length": 1685.0, "completions/min_terminated_length": 1685.0, "epoch": 0.05359565807327001, "frac_reward_zero_std": 0.0, "grad_norm": 0.7518226504325867, "kl": NaN, "learning_rate": 4.815389924085576e-07, "loss": -0.0562, "num_tokens": 5345350.0, "reward": 0.8791667819023132, "reward_std": 0.4417826533317566, "rewards/correctness_reward_func/mean": 0.6166666746139526, "rewards/correctness_reward_func/std": 0.37618499994277954, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 158 }, { "completion_length": 1451.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4533.0, "completions/max_terminated_length": 4533.0, "completions/mean_length": 1451.666748046875, "completions/mean_terminated_length": 1451.666748046875, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 0.0539348710990502, "frac_reward_zero_std": 0.5, "grad_norm": 0.09454381465911865, "kl": 0.0, "learning_rate": 4.813664596273292e-07, "loss": -0.0008, "num_tokens": 5374308.0, "reward": 1.1041667461395264, "reward_std": 0.05571504682302475, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 159 }, { "completion_length": 3554.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5146.0, "completions/max_terminated_length": 5146.0, "completions/mean_length": 3554.08349609375, "completions/mean_terminated_length": 3554.08349609375, "completions/min_length": 1727.0, "completions/min_terminated_length": 1727.0, "epoch": 0.054274084124830396, "frac_reward_zero_std": 1.0, "grad_norm": 5.624274308502208e-07, "kl": 0.0, "learning_rate": 4.811939268461007e-07, "loss": 0.0, "num_tokens": 5428465.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 160 }, { "completion_length": 1793.2500610351562, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5850.0, "completions/mean_length": 2891.416748046875, "completions/mean_terminated_length": 2151.900146484375, "completions/min_length": 509.0, "completions/min_terminated_length": 509.0, "epoch": 0.054613297150610585, "frac_reward_zero_std": 0.0, "grad_norm": 0.6140305995941162, "kl": NaN, "learning_rate": 4.810213940648723e-07, "loss": -0.102, "num_tokens": 5464558.0, "reward": 1.0166666507720947, "reward_std": 0.3798363208770752, "rewards/correctness_reward_func/mean": 0.7666666507720947, "rewards/correctness_reward_func/std": 0.3700941801071167, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 161 }, { "completion_length": 2627.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6292.0, "completions/max_terminated_length": 6292.0, "completions/mean_length": 2627.666748046875, "completions/mean_terminated_length": 2627.666748046875, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.05495251017639077, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.808488612836439e-07, "loss": 0.0, "num_tokens": 5509680.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 162 }, { "completion_length": 2503.8333740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4827.0, "completions/mean_length": 3052.916748046875, "completions/mean_terminated_length": 2731.45458984375, "completions/min_length": 867.0, "completions/min_terminated_length": 867.0, "epoch": 0.05529172320217096, "frac_reward_zero_std": 0.0, "grad_norm": 0.7075985670089722, "kl": NaN, "learning_rate": 4.806763285024155e-07, "loss": 0.0062, "num_tokens": 5549920.0, "reward": 0.8583333492279053, "reward_std": 0.4465666711330414, "rewards/correctness_reward_func/mean": 0.5833333134651184, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 163 }, { "completion_length": 2481.7501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4697.0, "completions/max_terminated_length": 4697.0, "completions/mean_length": 2481.75, "completions/mean_terminated_length": 2481.75, "completions/min_length": 965.0, "completions/min_terminated_length": 965.0, "epoch": 0.05563093622795115, "frac_reward_zero_std": 0.0, "grad_norm": 0.6668363213539124, "kl": 0.0, "learning_rate": 4.80503795721187e-07, "loss": -0.0086, "num_tokens": 5592253.0, "reward": 1.066666841506958, "reward_std": 0.21493908762931824, "rewards/correctness_reward_func/mean": 0.7666666507720947, "rewards/correctness_reward_func/std": 0.2534608840942383, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 164 }, { "completion_length": 1200.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2080.0, "completions/max_terminated_length": 2080.0, "completions/mean_length": 1200.416748046875, "completions/mean_terminated_length": 1200.416748046875, "completions/min_length": 654.0, "completions/min_terminated_length": 654.0, "epoch": 0.055970149253731345, "frac_reward_zero_std": 1.0, "grad_norm": 1.0779814374473062e-07, "kl": 0.0, "learning_rate": 4.803312629399586e-07, "loss": 0.0, "num_tokens": 5621964.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 165 }, { "completion_length": 1244.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3143.0, "completions/max_terminated_length": 3143.0, "completions/mean_length": 1244.5, "completions/mean_terminated_length": 1244.5, "completions/min_length": 583.0, "completions/min_terminated_length": 583.0, "epoch": 0.056309362279511534, "frac_reward_zero_std": 0.5, "grad_norm": 0.4018653631210327, "kl": 0.0, "learning_rate": 4.801587301587301e-07, "loss": 0.0058, "num_tokens": 5650680.0, "reward": 0.833333432674408, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 166 }, { "completion_length": 620.6666717529297, "completions/clipped_ratio": 0.0, "completions/max_length": 960.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 620.6666870117188, "completions/mean_terminated_length": 620.6666870117188, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.05664857530529172, "frac_reward_zero_std": 0.5, "grad_norm": 0.042549144476652145, "kl": 0.0, "learning_rate": 4.799861973775017e-07, "loss": -0.0001, "num_tokens": 5670530.0, "reward": 0.7875000834465027, "reward_std": 0.03061862289905548, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 167 }, { "completion_length": 3173.7501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 6019.0, "completions/max_terminated_length": 6019.0, "completions/mean_length": 3173.75, "completions/mean_terminated_length": 3173.75, "completions/min_length": 1452.0, "completions/min_terminated_length": 1452.0, "epoch": 0.05698778833107191, "frac_reward_zero_std": 0.0, "grad_norm": 0.9396628141403198, "kl": 0.0, "learning_rate": 4.798136645962732e-07, "loss": 0.0374, "num_tokens": 5720045.0, "reward": 0.9541667699813843, "reward_std": 0.42716550827026367, "rewards/correctness_reward_func/mean": 0.6666666269302368, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 168 }, { "completion_length": 1277.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1644.0, "completions/max_terminated_length": 1644.0, "completions/mean_length": 1277.8333740234375, "completions/mean_terminated_length": 1277.8333740234375, "completions/min_length": 650.0, "completions/min_terminated_length": 650.0, "epoch": 0.057327001356852106, "frac_reward_zero_std": 1.0, "grad_norm": 2.870347088901326e-07, "kl": 0.0, "learning_rate": 4.796411318150448e-07, "loss": 0.0, "num_tokens": 5749149.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 169 }, { "completion_length": 1964.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3663.0, "completions/max_terminated_length": 3663.0, "completions/mean_length": 1964.0, "completions/mean_terminated_length": 1964.0, "completions/min_length": 859.0, "completions/min_terminated_length": 859.0, "epoch": 0.057666214382632294, "frac_reward_zero_std": 0.5, "grad_norm": 0.475134015083313, "kl": 0.0, "learning_rate": 4.794685990338165e-07, "loss": -0.0215, "num_tokens": 5786289.0, "reward": 0.7833334803581238, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.43029236793518066, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 170 }, { "completion_length": 1437.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1939.0, "completions/max_terminated_length": 1939.0, "completions/mean_length": 1437.3333740234375, "completions/mean_terminated_length": 1437.3333740234375, "completions/min_length": 742.0, "completions/min_terminated_length": 742.0, "epoch": 0.05800542740841248, "frac_reward_zero_std": 0.0, "grad_norm": 0.4975443482398987, "kl": 0.0, "learning_rate": 4.792960662525879e-07, "loss": 0.0067, "num_tokens": 5814013.0, "reward": 0.9458333849906921, "reward_std": 0.39763349294662476, "rewards/correctness_reward_func/mean": 0.6833333969116211, "rewards/correctness_reward_func/std": 0.32427075505256653, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 171 }, { "completion_length": 3024.5833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6300.0, "completions/mean_length": 3573.666748046875, "completions/mean_terminated_length": 3299.545654296875, "completions/min_length": 1574.0, "completions/min_terminated_length": 1574.0, "epoch": 0.05834464043419267, "frac_reward_zero_std": 0.0, "grad_norm": 0.2008449286222458, "kl": NaN, "learning_rate": 4.791235334713596e-07, "loss": -0.011, "num_tokens": 5861486.0, "reward": 0.7375000715255737, "reward_std": 0.0853908509016037, "rewards/correctness_reward_func/mean": 0.44999995827674866, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 172 }, { "completion_length": 2863.7501220703125, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 5883.0, "completions/mean_length": 4511.0, "completions/mean_terminated_length": 3818.333251953125, "completions/min_length": 2050.0, "completions/min_terminated_length": 2050.0, "epoch": 0.05868385345997286, "frac_reward_zero_std": 0.0, "grad_norm": 0.30197957158088684, "kl": NaN, "learning_rate": 4.789510006901311e-07, "loss": -0.0339, "num_tokens": 5903651.0, "reward": 0.22500000894069672, "reward_std": 0.13869690895080566, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 173 }, { "completion_length": 1077.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3142.0, "completions/max_terminated_length": 3142.0, "completions/mean_length": 1077.8333740234375, "completions/mean_terminated_length": 1077.8333740234375, "completions/min_length": 555.0, "completions/min_terminated_length": 555.0, "epoch": 0.059023066485753055, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.787784679089027e-07, "loss": 0.0, "num_tokens": 5927547.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 174 }, { "completion_length": 2095.3334350585938, "completions/clipped_ratio": 0.41666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6126.0, "completions/mean_length": 4840.75, "completions/mean_terminated_length": 3592.000244140625, "completions/min_length": 1967.0, "completions/min_terminated_length": 1967.0, "epoch": 0.05936227951153324, "frac_reward_zero_std": 0.5, "grad_norm": 0.6622049808502197, "kl": NaN, "learning_rate": 4.786059351276742e-07, "loss": -0.071, "num_tokens": 5961595.0, "reward": 0.24166667461395264, "reward_std": 0.22453658282756805, "rewards/correctness_reward_func/mean": 0.06666667014360428, "rewards/correctness_reward_func/std": 0.2309401035308838, "rewards/format_reward_func/mean": 0.17500001192092896, "rewards/format_reward_func/std": 0.15447859466075897, "step": 175 }, { "completion_length": 2191.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3487.0, "completions/max_terminated_length": 3487.0, "completions/mean_length": 2191.75, "completions/mean_terminated_length": 2191.75, "completions/min_length": 704.0, "completions/min_terminated_length": 704.0, "epoch": 0.05970149253731343, "frac_reward_zero_std": 0.5, "grad_norm": 0.6641126275062561, "kl": 0.0, "learning_rate": 4.784334023464458e-07, "loss": -0.0095, "num_tokens": 5995228.0, "reward": 0.9000000953674316, "reward_std": 0.21908903121948242, "rewards/correctness_reward_func/mean": 0.6000000238418579, "rewards/correctness_reward_func/std": 0.36181360483169556, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 176 }, { "completion_length": 2058.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5513.0, "completions/max_terminated_length": 5513.0, "completions/mean_length": 2058.166748046875, "completions/mean_terminated_length": 2058.166748046875, "completions/min_length": 854.0, "completions/min_terminated_length": 854.0, "epoch": 0.06004070556309362, "frac_reward_zero_std": 1.0, "grad_norm": 2.813504522691801e-07, "kl": 0.0, "learning_rate": 4.782608695652174e-07, "loss": 0.0, "num_tokens": 6032094.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 177 }, { "completion_length": 2605.0001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 5421.0, "completions/max_terminated_length": 5421.0, "completions/mean_length": 2605.0, "completions/mean_terminated_length": 2605.0, "completions/min_length": 1045.0, "completions/min_terminated_length": 1045.0, "epoch": 0.060379918588873815, "frac_reward_zero_std": 0.0, "grad_norm": 0.9859559535980225, "kl": 0.0, "learning_rate": 4.780883367839889e-07, "loss": -0.0052, "num_tokens": 6076770.0, "reward": 1.0333333015441895, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func/mean": 0.7333332896232605, "rewards/correctness_reward_func/std": 0.3550501763820648, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 178 }, { "completion_length": 2074.8334350585938, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4515.0, "completions/mean_length": 2623.916748046875, "completions/mean_terminated_length": 2263.45458984375, "completions/min_length": 1183.0, "completions/min_terminated_length": 1183.0, "epoch": 0.060719131614654004, "frac_reward_zero_std": 0.0, "grad_norm": 0.6357032656669617, "kl": NaN, "learning_rate": 4.779158040027605e-07, "loss": -0.0209, "num_tokens": 6112522.0, "reward": 1.0916666984558105, "reward_std": 0.26536136865615845, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 179 }, { "completion_length": 1129.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2240.0, "completions/max_terminated_length": 2240.0, "completions/mean_length": 1129.0833740234375, "completions/mean_terminated_length": 1129.0833740234375, "completions/min_length": 559.0, "completions/min_terminated_length": 559.0, "epoch": 0.06105834464043419, "frac_reward_zero_std": 0.0, "grad_norm": 0.09178594499826431, "kl": 0.0, "learning_rate": 4.777432712215321e-07, "loss": -0.001, "num_tokens": 6134249.0, "reward": 1.25, "reward_std": 0.09246455878019333, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 180 }, { "completion_length": 2996.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5630.0, "completions/max_terminated_length": 5630.0, "completions/mean_length": 2996.5, "completions/mean_terminated_length": 2996.5, "completions/min_length": 794.0, "completions/min_terminated_length": 794.0, "epoch": 0.06139755766621438, "frac_reward_zero_std": 0.5, "grad_norm": 0.6492721438407898, "kl": 0.0, "learning_rate": 4.775707384403036e-07, "loss": 0.0108, "num_tokens": 6184307.0, "reward": 0.9666668176651001, "reward_std": 0.20655910670757294, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 181 }, { "completion_length": 2291.25, "completions/clipped_ratio": 0.0, "completions/max_length": 4261.0, "completions/max_terminated_length": 4261.0, "completions/mean_length": 2291.25, "completions/mean_terminated_length": 2291.25, "completions/min_length": 632.0, "completions/min_terminated_length": 632.0, "epoch": 0.06173677069199457, "frac_reward_zero_std": 0.5, "grad_norm": 0.625225841999054, "kl": 0.0, "learning_rate": 4.773982056590752e-07, "loss": 0.0127, "num_tokens": 6219686.0, "reward": 0.6333333253860474, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 182 }, { "completion_length": 1227.0833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 2438.0, "completions/mean_length": 1776.166748046875, "completions/mean_terminated_length": 1338.6363525390625, "completions/min_length": 740.0, "completions/min_terminated_length": 740.0, "epoch": 0.062075983717774764, "frac_reward_zero_std": 0.0, "grad_norm": 0.7521762847900391, "kl": NaN, "learning_rate": 4.772256728778468e-07, "loss": -0.0184, "num_tokens": 6244383.0, "reward": 0.9291666746139526, "reward_std": 0.41845452785491943, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 183 }, { "completion_length": 2018.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 6042.0, "completions/max_terminated_length": 6042.0, "completions/mean_length": 2018.0833740234375, "completions/mean_terminated_length": 2018.0833740234375, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "epoch": 0.06241519674355495, "frac_reward_zero_std": 0.5, "grad_norm": 0.6339729428291321, "kl": 0.0, "learning_rate": 4.770531400966183e-07, "loss": -0.0363, "num_tokens": 6279580.0, "reward": 0.7583333849906921, "reward_std": 0.21946904063224792, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.43029236793518066, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 184 }, { "completion_length": 2131.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4085.0, "completions/max_terminated_length": 4085.0, "completions/mean_length": 2131.0, "completions/mean_terminated_length": 2131.0, "completions/min_length": 1032.0, "completions/min_terminated_length": 1032.0, "epoch": 0.06275440976933515, "frac_reward_zero_std": 1.0, "grad_norm": 3.2719327691665967e-07, "kl": 0.0, "learning_rate": 4.7688060731539e-07, "loss": 0.0, "num_tokens": 6317164.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 185 }, { "completion_length": 1139.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2115.0, "completions/max_terminated_length": 2115.0, "completions/mean_length": 1139.8333740234375, "completions/mean_terminated_length": 1139.8333740234375, "completions/min_length": 575.0, "completions/min_terminated_length": 575.0, "epoch": 0.06309362279511534, "frac_reward_zero_std": 1.0, "grad_norm": 1.9935814066229796e-07, "kl": 0.0, "learning_rate": 4.7670807453416146e-07, "loss": 0.0, "num_tokens": 6345572.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 186 }, { "completion_length": 2896.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4331.0, "completions/max_terminated_length": 4331.0, "completions/mean_length": 2896.08349609375, "completions/mean_terminated_length": 2896.08349609375, "completions/min_length": 1513.0, "completions/min_terminated_length": 1513.0, "epoch": 0.06343283582089553, "frac_reward_zero_std": 1.0, "grad_norm": 3.4238826174259884e-07, "kl": 0.0, "learning_rate": 4.76535541752933e-07, "loss": 0.0, "num_tokens": 6390285.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 187 }, { "completion_length": 1860.25, "completions/clipped_ratio": 0.0, "completions/max_length": 4907.0, "completions/max_terminated_length": 4907.0, "completions/mean_length": 1860.25, "completions/mean_terminated_length": 1860.25, "completions/min_length": 922.0, "completions/min_terminated_length": 922.0, "epoch": 0.06377204884667571, "frac_reward_zero_std": 0.5, "grad_norm": 1.6667431592941284, "kl": 0.0, "learning_rate": 4.763630089717046e-07, "loss": 0.0349, "num_tokens": 6425994.0, "reward": 1.0833334922790527, "reward_std": 0.19407902657985687, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.26227444410324097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 188 }, { "completion_length": 1321.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3460.0, "completions/max_terminated_length": 3460.0, "completions/mean_length": 1321.666748046875, "completions/mean_terminated_length": 1321.666748046875, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 0.0641112618724559, "frac_reward_zero_std": 0.5, "grad_norm": 0.37701964378356934, "kl": 0.0, "learning_rate": 4.761904761904761e-07, "loss": 0.0042, "num_tokens": 6448568.0, "reward": 0.7166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.5149286985397339, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 189 }, { "completion_length": 2525.33349609375, "completions/clipped_ratio": 0.0, "completions/max_length": 4880.0, "completions/max_terminated_length": 4880.0, "completions/mean_length": 2525.33349609375, "completions/mean_terminated_length": 2525.33349609375, "completions/min_length": 879.0, "completions/min_terminated_length": 879.0, "epoch": 0.06445047489823609, "frac_reward_zero_std": 0.0, "grad_norm": 1.206994891166687, "kl": 0.0, "learning_rate": 4.7601794340924773e-07, "loss": 0.0475, "num_tokens": 6491160.0, "reward": 0.8333333730697632, "reward_std": 0.4772879481315613, "rewards/correctness_reward_func/mean": 0.5333333015441895, "rewards/correctness_reward_func/std": 0.47736650705337524, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 190 }, { "completion_length": 994.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2085.0, "completions/max_terminated_length": 2085.0, "completions/mean_length": 994.5833740234375, "completions/mean_terminated_length": 994.5833740234375, "completions/min_length": 526.0, "completions/min_terminated_length": 526.0, "epoch": 0.06478968792401628, "frac_reward_zero_std": 1.0, "grad_norm": 2.1302626862507168e-07, "kl": 0.0, "learning_rate": 4.758454106280193e-07, "loss": 0.0, "num_tokens": 6513985.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 191 }, { "completion_length": 1926.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3138.0, "completions/max_terminated_length": 3138.0, "completions/mean_length": 1926.0, "completions/mean_terminated_length": 1926.0, "completions/min_length": 926.0, "completions/min_terminated_length": 926.0, "epoch": 0.06512890094979647, "frac_reward_zero_std": 1.0, "grad_norm": 2.765222859579808e-07, "kl": 0.0, "learning_rate": 4.756728778467909e-07, "loss": 0.0, "num_tokens": 6548065.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 192 }, { "completion_length": 2050.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3179.0, "completions/max_terminated_length": 3179.0, "completions/mean_length": 2050.666748046875, "completions/mean_terminated_length": 2050.666748046875, "completions/min_length": 708.0, "completions/min_terminated_length": 708.0, "epoch": 0.06546811397557666, "frac_reward_zero_std": 1.0, "grad_norm": 1.0529081606591717e-07, "kl": 0.0, "learning_rate": 4.7550034506556244e-07, "loss": 0.0, "num_tokens": 6586623.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 193 }, { "completion_length": 2184.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3969.0, "completions/max_terminated_length": 3969.0, "completions/mean_length": 2184.58349609375, "completions/mean_terminated_length": 2184.58349609375, "completions/min_length": 853.0, "completions/min_terminated_length": 853.0, "epoch": 0.06580732700135686, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.75327812284334e-07, "loss": 0.0, "num_tokens": 6622516.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 194 }, { "completion_length": 1352.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2634.0, "completions/max_terminated_length": 2634.0, "completions/mean_length": 1352.666748046875, "completions/mean_terminated_length": 1352.666748046875, "completions/min_length": 597.0, "completions/min_terminated_length": 597.0, "epoch": 0.06614654002713705, "frac_reward_zero_std": 0.5, "grad_norm": 0.10883874446153641, "kl": 0.0, "learning_rate": 4.751552795031056e-07, "loss": -0.0022, "num_tokens": 6653136.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 195 }, { "completion_length": 1927.0000610351562, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 4352.0, "completions/mean_length": 3025.166748046875, "completions/mean_terminated_length": 2312.400146484375, "completions/min_length": 1171.0, "completions/min_terminated_length": 1171.0, "epoch": 0.06648575305291723, "frac_reward_zero_std": 0.5, "grad_norm": 0.4869142770767212, "kl": NaN, "learning_rate": 4.7498274672187715e-07, "loss": -0.0407, "num_tokens": 6689124.0, "reward": 0.8500000834465027, "reward_std": 0.279284805059433, "rewards/correctness_reward_func/mean": 0.6000000238418579, "rewards/correctness_reward_func/std": 0.36181360483169556, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 196 }, { "completion_length": 1327.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2700.0, "completions/max_terminated_length": 2700.0, "completions/mean_length": 1327.0, "completions/mean_terminated_length": 1327.0, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 0.06682496607869742, "frac_reward_zero_std": 1.0, "grad_norm": 9.299483849645185e-08, "kl": 0.0, "learning_rate": 4.748102139406487e-07, "loss": 0.0, "num_tokens": 6719208.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 197 }, { "completion_length": 1178.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3020.0, "completions/max_terminated_length": 3020.0, "completions/mean_length": 1178.8333740234375, "completions/mean_terminated_length": 1178.8333740234375, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.06716417910447761, "frac_reward_zero_std": 0.5, "grad_norm": 0.08082883059978485, "kl": 0.0, "learning_rate": 4.7463768115942026e-07, "loss": -0.0002, "num_tokens": 6742648.0, "reward": 0.6375000476837158, "reward_std": 0.03061862289905548, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.23750001192092896, "rewards/format_reward_func/std": 0.07723929733037949, "step": 198 }, { "completion_length": 2507.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5850.0, "completions/max_terminated_length": 5850.0, "completions/mean_length": 2507.416748046875, "completions/mean_terminated_length": 2507.416748046875, "completions/min_length": 1005.0, "completions/min_terminated_length": 1005.0, "epoch": 0.0675033921302578, "frac_reward_zero_std": 0.5, "grad_norm": 0.6426955461502075, "kl": 0.0, "learning_rate": 4.7446514837819186e-07, "loss": -0.0252, "num_tokens": 6785319.0, "reward": 1.0166666507720947, "reward_std": 0.24013885855674744, "rewards/correctness_reward_func/mean": 0.7166666984558105, "rewards/correctness_reward_func/std": 0.4386618733406067, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 199 }, { "completion_length": 1075.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2798.0, "completions/max_terminated_length": 2798.0, "completions/mean_length": 1075.0833740234375, "completions/mean_terminated_length": 1075.0833740234375, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 0.06784260515603799, "frac_reward_zero_std": 0.5, "grad_norm": 0.05683635175228119, "kl": 0.0, "learning_rate": 4.7429261559696336e-07, "loss": 0.0, "num_tokens": 6809584.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 200 }, { "completion_length": 3697.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 6119.0, "completions/max_terminated_length": 6119.0, "completions/mean_length": 3697.58349609375, "completions/mean_terminated_length": 3697.58349609375, "completions/min_length": 1100.0, "completions/min_terminated_length": 1100.0, "epoch": 0.06818181818181818, "frac_reward_zero_std": 1.0, "grad_norm": 1.3760663364337233e-07, "kl": 0.0, "learning_rate": 4.7412008281573497e-07, "loss": 0.0, "num_tokens": 6863723.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 201 }, { "completion_length": 2568.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4082.0, "completions/max_terminated_length": 4082.0, "completions/mean_length": 2568.916748046875, "completions/mean_terminated_length": 2568.916748046875, "completions/min_length": 1818.0, "completions/min_terminated_length": 1818.0, "epoch": 0.06852103120759837, "frac_reward_zero_std": 0.5, "grad_norm": 0.7128849029541016, "kl": 0.0, "learning_rate": 4.739475500345065e-07, "loss": -0.0046, "num_tokens": 6904072.0, "reward": 1.1041667461395264, "reward_std": 0.23474274575710297, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 202 }, { "completion_length": 2055.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3684.0, "completions/max_terminated_length": 3684.0, "completions/mean_length": 2055.416748046875, "completions/mean_terminated_length": 2055.416748046875, "completions/min_length": 790.0, "completions/min_terminated_length": 790.0, "epoch": 0.06886024423337857, "frac_reward_zero_std": 0.0, "grad_norm": 0.10221155732870102, "kl": 0.0, "learning_rate": 4.737750172532781e-07, "loss": 0.0007, "num_tokens": 6943305.0, "reward": 1.0750000476837158, "reward_std": 0.06123722717165947, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 203 }, { "completion_length": 1224.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3263.0, "completions/max_terminated_length": 3263.0, "completions/mean_length": 1224.25, "completions/mean_terminated_length": 1224.25, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.06919945725915876, "frac_reward_zero_std": 1.0, "grad_norm": 3.1618932894161844e-07, "kl": 0.0, "learning_rate": 4.736024844720496e-07, "loss": 0.0, "num_tokens": 6971046.0, "reward": 1.125, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.07833494991064072, "step": 204 }, { "completion_length": 1673.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3900.0, "completions/max_terminated_length": 3900.0, "completions/mean_length": 1673.8333740234375, "completions/mean_terminated_length": 1673.8333740234375, "completions/min_length": 938.0, "completions/min_terminated_length": 938.0, "epoch": 0.06953867028493894, "frac_reward_zero_std": 1.0, "grad_norm": 3.1687034152128035e-07, "kl": 0.0, "learning_rate": 4.7342995169082123e-07, "loss": 0.0, "num_tokens": 7004194.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 205 }, { "completion_length": 1079.5, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4487.0, "completions/mean_length": 3275.83349609375, "completions/mean_terminated_length": 1619.25, "completions/min_length": 581.0, "completions/min_terminated_length": 581.0, "epoch": 0.06987788331071913, "frac_reward_zero_std": 0.0, "grad_norm": 0.931649923324585, "kl": NaN, "learning_rate": 4.732574189095928e-07, "loss": -0.0345, "num_tokens": 7028488.0, "reward": 0.7666666507720947, "reward_std": 0.24533745646476746, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.5033223032951355, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.133143812417984, "step": 206 }, { "completion_length": 1299.6667175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 3137.0, "completions/max_terminated_length": 3137.0, "completions/mean_length": 1299.666748046875, "completions/mean_terminated_length": 1299.666748046875, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 0.07021709633649932, "frac_reward_zero_std": 0.5, "grad_norm": 0.07907649129629135, "kl": 0.0, "learning_rate": 4.730848861283644e-07, "loss": -0.0023, "num_tokens": 7053222.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 207 }, { "completion_length": 1989.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2931.0, "completions/max_terminated_length": 2931.0, "completions/mean_length": 1989.5, "completions/mean_terminated_length": 1989.5, "completions/min_length": 709.0, "completions/min_terminated_length": 709.0, "epoch": 0.07055630936227951, "frac_reward_zero_std": 0.0, "grad_norm": 2.503600597381592, "kl": 0.0, "learning_rate": 4.7291235334713594e-07, "loss": 0.0031, "num_tokens": 7089882.0, "reward": 0.8500000834465027, "reward_std": 0.2473839521408081, "rewards/correctness_reward_func/mean": 0.550000011920929, "rewards/correctness_reward_func/std": 0.4100997745990753, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 208 }, { "completion_length": 2048.5834350585938, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5063.0, "completions/mean_length": 3146.75, "completions/mean_terminated_length": 2458.300048828125, "completions/min_length": 1278.0, "completions/min_terminated_length": 1278.0, "epoch": 0.0708955223880597, "frac_reward_zero_std": 0.5, "grad_norm": 6.7276811599731445, "kl": NaN, "learning_rate": 4.727398205659075e-07, "loss": -0.0123, "num_tokens": 7128205.0, "reward": 0.6375000476837158, "reward_std": 0.07373940199613571, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.23750001192092896, "rewards/format_reward_func/std": 0.11894422769546509, "step": 209 }, { "completion_length": 2508.166748046875, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 6519.0, "completions/mean_length": 4155.4169921875, "completions/mean_terminated_length": 3344.22216796875, "completions/min_length": 1361.0, "completions/min_terminated_length": 1361.0, "epoch": 0.07123473541383989, "frac_reward_zero_std": 0.5, "grad_norm": 0.17133860290050507, "kl": NaN, "learning_rate": 4.725672877846791e-07, "loss": -0.0279, "num_tokens": 7167573.0, "reward": 0.6250001192092896, "reward_std": 0.08215838670730591, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 210 }, { "completion_length": 1452.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3934.0, "completions/max_terminated_length": 3934.0, "completions/mean_length": 1452.0, "completions/mean_terminated_length": 1452.0, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 0.07157394843962007, "frac_reward_zero_std": 0.0, "grad_norm": 0.1274157166481018, "kl": 0.0, "learning_rate": 4.723947550034506e-07, "loss": -0.0028, "num_tokens": 7198827.0, "reward": 1.2000000476837158, "reward_std": 0.09350206702947617, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.07385490089654922, "step": 211 }, { "completion_length": 1059.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1710.0, "completions/max_terminated_length": 1710.0, "completions/mean_length": 1059.5833740234375, "completions/mean_terminated_length": 1059.5833740234375, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "epoch": 0.07191316146540028, "frac_reward_zero_std": 0.5, "grad_norm": 0.049741536378860474, "kl": 0.0, "learning_rate": 4.722222222222222e-07, "loss": -0.0011, "num_tokens": 7226206.0, "reward": 1.1875, "reward_std": 0.030618613585829735, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 212 }, { "completion_length": 2186.7501220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4565.0, "completions/mean_length": 2735.83349609375, "completions/mean_terminated_length": 2385.54541015625, "completions/min_length": 1381.0, "completions/min_terminated_length": 1381.0, "epoch": 0.07225237449118047, "frac_reward_zero_std": 0.0, "grad_norm": 0.7999017834663391, "kl": NaN, "learning_rate": 4.7204968944099376e-07, "loss": -0.0145, "num_tokens": 7262761.0, "reward": 0.658333420753479, "reward_std": 0.25531625747680664, "rewards/correctness_reward_func/mean": 0.38333332538604736, "rewards/correctness_reward_func/std": 0.4783177673816681, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 213 }, { "completion_length": 2076.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4194.0, "completions/mean_length": 2625.08349609375, "completions/mean_terminated_length": 2264.727294921875, "completions/min_length": 1039.0, "completions/min_terminated_length": 1039.0, "epoch": 0.07259158751696065, "frac_reward_zero_std": 0.0, "grad_norm": 0.8687799572944641, "kl": NaN, "learning_rate": 4.7187715665976537e-07, "loss": -0.02, "num_tokens": 7298377.0, "reward": 0.625, "reward_std": 0.5025304555892944, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4972652792930603, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.10112998634576797, "step": 214 }, { "completion_length": 1112.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2367.0, "completions/max_terminated_length": 2367.0, "completions/mean_length": 1112.5, "completions/mean_terminated_length": 1112.5, "completions/min_length": 573.0, "completions/min_terminated_length": 573.0, "epoch": 0.07293080054274084, "frac_reward_zero_std": 1.0, "grad_norm": 1.1586325143753129e-07, "kl": 0.0, "learning_rate": 4.7170462387853687e-07, "loss": 0.0, "num_tokens": 7324171.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 215 }, { "completion_length": 1243.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2557.0, "completions/max_terminated_length": 2557.0, "completions/mean_length": 1243.916748046875, "completions/mean_terminated_length": 1243.916748046875, "completions/min_length": 653.0, "completions/min_terminated_length": 653.0, "epoch": 0.07327001356852103, "frac_reward_zero_std": 0.5, "grad_norm": 0.3274558484554291, "kl": 0.0, "learning_rate": 4.7153209109730847e-07, "loss": -0.0002, "num_tokens": 7354998.0, "reward": 1.0208334922790527, "reward_std": 0.19391795992851257, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 216 }, { "completion_length": 1607.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4015.0, "completions/max_terminated_length": 4015.0, "completions/mean_length": 1607.0833740234375, "completions/mean_terminated_length": 1607.0833740234375, "completions/min_length": 716.0, "completions/min_terminated_length": 716.0, "epoch": 0.07360922659430122, "frac_reward_zero_std": 0.5, "grad_norm": 0.11406178027391434, "kl": 0.0, "learning_rate": 4.7135955831608e-07, "loss": -0.0028, "num_tokens": 7391251.0, "reward": 0.7333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 217 }, { "completion_length": 774.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1245.0, "completions/max_terminated_length": 1245.0, "completions/mean_length": 774.5833740234375, "completions/mean_terminated_length": 774.5833740234375, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "epoch": 0.07394843962008141, "frac_reward_zero_std": 0.5, "grad_norm": 0.0900874137878418, "kl": 0.0, "learning_rate": 4.7118702553485163e-07, "loss": 0.0, "num_tokens": 7416254.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 218 }, { "completion_length": 1318.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2595.0, "completions/max_terminated_length": 2595.0, "completions/mean_length": 1318.0, "completions/mean_terminated_length": 1318.0, "completions/min_length": 544.0, "completions/min_terminated_length": 544.0, "epoch": 0.0742876526458616, "frac_reward_zero_std": 0.5, "grad_norm": 0.0830475240945816, "kl": 0.0, "learning_rate": 4.7101449275362313e-07, "loss": -0.0019, "num_tokens": 7442162.0, "reward": 1.133333444595337, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 219 }, { "completion_length": 1440.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3651.0, "completions/max_terminated_length": 3651.0, "completions/mean_length": 1440.75, "completions/mean_terminated_length": 1440.75, "completions/min_length": 604.0, "completions/min_terminated_length": 604.0, "epoch": 0.07462686567164178, "frac_reward_zero_std": 0.5, "grad_norm": 0.1543527990579605, "kl": 0.0, "learning_rate": 4.7084195997239474e-07, "loss": -0.0023, "num_tokens": 7466825.0, "reward": 1.2333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 220 }, { "completion_length": 1963.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3612.0, "completions/max_terminated_length": 3612.0, "completions/mean_length": 1963.0, "completions/mean_terminated_length": 1963.0, "completions/min_length": 696.0, "completions/min_terminated_length": 696.0, "epoch": 0.07496607869742199, "frac_reward_zero_std": 0.5, "grad_norm": 0.09828822314739227, "kl": 0.0, "learning_rate": 4.706694271911663e-07, "loss": 0.002, "num_tokens": 7500995.0, "reward": 1.120833396911621, "reward_std": 0.04005204886198044, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 221 }, { "completion_length": 1688.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4377.0, "completions/max_terminated_length": 4377.0, "completions/mean_length": 1688.8333740234375, "completions/mean_terminated_length": 1688.8333740234375, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.07530529172320218, "frac_reward_zero_std": 0.5, "grad_norm": 0.280002236366272, "kl": 0.0, "learning_rate": 4.704968944099379e-07, "loss": 0.0004, "num_tokens": 7535055.0, "reward": 0.9333333373069763, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.6333333253860474, "rewards/correctness_reward_func/std": 0.4735424220561981, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 222 }, { "completion_length": 1618.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2832.0, "completions/max_terminated_length": 2832.0, "completions/mean_length": 1618.666748046875, "completions/mean_terminated_length": 1618.666748046875, "completions/min_length": 1044.0, "completions/min_terminated_length": 1044.0, "epoch": 0.07564450474898236, "frac_reward_zero_std": 0.0, "grad_norm": 0.5872350931167603, "kl": 0.0, "learning_rate": 4.7032436162870945e-07, "loss": 0.0123, "num_tokens": 7569689.0, "reward": 1.087499976158142, "reward_std": 0.3197711706161499, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.38138505816459656, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 223 }, { "completion_length": 1416.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2253.0, "completions/max_terminated_length": 2253.0, "completions/mean_length": 1416.8333740234375, "completions/mean_terminated_length": 1416.8333740234375, "completions/min_length": 550.0, "completions/min_terminated_length": 550.0, "epoch": 0.07598371777476255, "frac_reward_zero_std": 1.0, "grad_norm": 2.0870072603429435e-07, "kl": 0.0, "learning_rate": 4.70151828847481e-07, "loss": 0.0, "num_tokens": 7598295.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 224 }, { "completion_length": 1122.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2544.0, "completions/max_terminated_length": 2544.0, "completions/mean_length": 1122.75, "completions/mean_terminated_length": 1122.75, "completions/min_length": 503.0, "completions/min_terminated_length": 503.0, "epoch": 0.07632293080054274, "frac_reward_zero_std": 0.0, "grad_norm": 0.3819059133529663, "kl": 0.0, "learning_rate": 4.699792960662526e-07, "loss": -0.0098, "num_tokens": 7625124.0, "reward": 1.0833333730697632, "reward_std": 0.2010922133922577, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.2806917726993561, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.07385490089654922, "step": 225 }, { "completion_length": 2394.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3629.0, "completions/max_terminated_length": 3629.0, "completions/mean_length": 2394.666748046875, "completions/mean_terminated_length": 2394.666748046875, "completions/min_length": 1138.0, "completions/min_terminated_length": 1138.0, "epoch": 0.07666214382632293, "frac_reward_zero_std": 0.0, "grad_norm": 0.9332161545753479, "kl": 0.0, "learning_rate": 4.698067632850241e-07, "loss": 0.0003, "num_tokens": 7666064.0, "reward": 0.9166666865348816, "reward_std": 0.3129710853099823, "rewards/correctness_reward_func/mean": 0.6166666150093079, "rewards/correctness_reward_func/std": 0.4628632962703705, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 226 }, { "completion_length": 864.1667175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 1947.0, "completions/max_terminated_length": 1947.0, "completions/mean_length": 864.1666870117188, "completions/mean_terminated_length": 864.1666870117188, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 0.07700135685210312, "frac_reward_zero_std": 1.0, "grad_norm": 7.138903868053603e-08, "kl": 0.0, "learning_rate": 4.696342305037957e-07, "loss": 0.0, "num_tokens": 7688386.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 227 }, { "completion_length": 2737.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4239.0, "completions/max_terminated_length": 4239.0, "completions/mean_length": 2737.5, "completions/mean_terminated_length": 2737.5, "completions/min_length": 1432.0, "completions/min_terminated_length": 1432.0, "epoch": 0.0773405698778833, "frac_reward_zero_std": 0.0, "grad_norm": 0.6406996846199036, "kl": 0.0, "learning_rate": 4.6946169772256726e-07, "loss": 0.0023, "num_tokens": 7735372.0, "reward": 0.8000000715255737, "reward_std": 0.24494895339012146, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.4472135901451111, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 228 }, { "completion_length": 1794.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5741.0, "completions/max_terminated_length": 5741.0, "completions/mean_length": 1794.166748046875, "completions/mean_terminated_length": 1794.166748046875, "completions/min_length": 699.0, "completions/min_terminated_length": 699.0, "epoch": 0.0776797829036635, "frac_reward_zero_std": 1.0, "grad_norm": 2.823469458235195e-07, "kl": 0.0, "learning_rate": 4.6928916494133887e-07, "loss": 0.0, "num_tokens": 7763172.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 229 }, { "completion_length": 2012.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3545.0, "completions/max_terminated_length": 3545.0, "completions/mean_length": 2012.166748046875, "completions/mean_terminated_length": 2012.166748046875, "completions/min_length": 992.0, "completions/min_terminated_length": 992.0, "epoch": 0.0780189959294437, "frac_reward_zero_std": 0.5, "grad_norm": 0.5563703775405884, "kl": 0.0, "learning_rate": 4.6911663216011037e-07, "loss": -0.0003, "num_tokens": 7796900.0, "reward": 0.8500000834465027, "reward_std": 0.2345207929611206, "rewards/correctness_reward_func/mean": 0.550000011920929, "rewards/correctness_reward_func/std": 0.4100997745990753, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 230 }, { "completion_length": 1420.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2537.0, "completions/max_terminated_length": 2537.0, "completions/mean_length": 1420.75, "completions/mean_terminated_length": 1420.75, "completions/min_length": 1000.0, "completions/min_terminated_length": 1000.0, "epoch": 0.07835820895522388, "frac_reward_zero_std": 0.5, "grad_norm": 0.374015748500824, "kl": 0.0, "learning_rate": 4.68944099378882e-07, "loss": -0.002, "num_tokens": 7822997.0, "reward": 0.6541666984558105, "reward_std": 0.21588000655174255, "rewards/correctness_reward_func/mean": 0.36666667461395264, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 231 }, { "completion_length": 1682.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4546.0, "completions/max_terminated_length": 4546.0, "completions/mean_length": 1682.166748046875, "completions/mean_terminated_length": 1682.166748046875, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.07869742198100407, "frac_reward_zero_std": 0.5, "grad_norm": 0.07696101814508438, "kl": 0.0, "learning_rate": 4.6877156659765353e-07, "loss": -0.0003, "num_tokens": 7851037.0, "reward": 0.7875000834465027, "reward_std": 0.03061862289905548, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 232 }, { "completion_length": 1198.5833740234375, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6101.0, "completions/mean_length": 3394.916748046875, "completions/mean_terminated_length": 1797.875, "completions/min_length": 535.0, "completions/min_terminated_length": 535.0, "epoch": 0.07903663500678426, "frac_reward_zero_std": 0.0, "grad_norm": 1.621031403541565, "kl": NaN, "learning_rate": 4.6859903381642513e-07, "loss": -0.0886, "num_tokens": 7876436.0, "reward": 0.7625000476837158, "reward_std": 0.3166946768760681, "rewards/correctness_reward_func/mean": 0.550000011920929, "rewards/correctness_reward_func/std": 0.4100997745990753, "rewards/format_reward_func/mean": 0.21250002086162567, "rewards/format_reward_func/std": 0.13505050539970398, "step": 233 }, { "completion_length": 2415.08349609375, "completions/clipped_ratio": 0.0, "completions/max_length": 5319.0, "completions/max_terminated_length": 5319.0, "completions/mean_length": 2415.08349609375, "completions/mean_terminated_length": 2415.08349609375, "completions/min_length": 1029.0, "completions/min_terminated_length": 1029.0, "epoch": 0.07937584803256445, "frac_reward_zero_std": 0.5, "grad_norm": 0.14514191448688507, "kl": 0.0, "learning_rate": 4.6842650103519663e-07, "loss": -0.0108, "num_tokens": 7914051.0, "reward": 0.7333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 234 }, { "completion_length": 1313.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3703.0, "completions/max_terminated_length": 3703.0, "completions/mean_length": 1313.75, "completions/mean_terminated_length": 1313.75, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 0.07971506105834464, "frac_reward_zero_std": 0.0, "grad_norm": 1.3174349069595337, "kl": 0.0, "learning_rate": 4.6825396825396824e-07, "loss": 0.001, "num_tokens": 7941624.0, "reward": 0.7125000357627869, "reward_std": 0.09585144370794296, "rewards/correctness_reward_func/mean": 0.45000001788139343, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 235 }, { "completion_length": 800.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1525.0, "completions/max_terminated_length": 1525.0, "completions/mean_length": 800.3333740234375, "completions/mean_terminated_length": 800.3333740234375, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.08005427408412483, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.680814354727398e-07, "loss": 0.0, "num_tokens": 7963864.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 236 }, { "completion_length": 2569.166748046875, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 4853.0, "completions/mean_length": 3667.33349609375, "completions/mean_terminated_length": 3083.0, "completions/min_length": 1522.0, "completions/min_terminated_length": 1522.0, "epoch": 0.08039348710990502, "frac_reward_zero_std": 0.0, "grad_norm": 1.3099312782287598, "kl": NaN, "learning_rate": 4.6790890269151135e-07, "loss": -0.0863, "num_tokens": 8008752.0, "reward": 0.9833334684371948, "reward_std": 0.35421618819236755, "rewards/correctness_reward_func/mean": 0.7333332896232605, "rewards/correctness_reward_func/std": 0.3550501763820648, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 237 }, { "completion_length": 883.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1392.0, "completions/max_terminated_length": 1392.0, "completions/mean_length": 883.4166870117188, "completions/mean_terminated_length": 883.4166870117188, "completions/min_length": 577.0, "completions/min_terminated_length": 577.0, "epoch": 0.0807327001356852, "frac_reward_zero_std": 0.0, "grad_norm": 0.08914493024349213, "kl": 0.0, "learning_rate": 4.677363699102829e-07, "loss": 0.001, "num_tokens": 8031389.0, "reward": 0.6916667819023132, "reward_std": 0.07955466210842133, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 238 }, { "completion_length": 1487.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3340.0, "completions/max_terminated_length": 3340.0, "completions/mean_length": 1487.25, "completions/mean_terminated_length": 1487.25, "completions/min_length": 669.0, "completions/min_terminated_length": 669.0, "epoch": 0.0810719131614654, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.675638371290545e-07, "loss": 0.0, "num_tokens": 8063738.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 239 }, { "completion_length": 3740.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5830.0, "completions/max_terminated_length": 5830.0, "completions/mean_length": 3740.416748046875, "completions/mean_terminated_length": 3740.416748046875, "completions/min_length": 1517.0, "completions/min_terminated_length": 1517.0, "epoch": 0.0814111261872456, "frac_reward_zero_std": 0.5, "grad_norm": 0.10733194649219513, "kl": 0.0, "learning_rate": 4.673913043478261e-07, "loss": -0.005, "num_tokens": 8124595.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 240 }, { "completion_length": 1372.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3355.0, "completions/max_terminated_length": 3355.0, "completions/mean_length": 1372.8333740234375, "completions/mean_terminated_length": 1372.8333740234375, "completions/min_length": 651.0, "completions/min_terminated_length": 651.0, "epoch": 0.08175033921302578, "frac_reward_zero_std": 1.0, "grad_norm": 2.8452433298298274e-07, "kl": 0.0, "learning_rate": 4.672187715665976e-07, "loss": 0.0, "num_tokens": 8154857.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 241 }, { "completion_length": 2199.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 5927.0, "completions/max_terminated_length": 5927.0, "completions/mean_length": 2199.25, "completions/mean_terminated_length": 2199.25, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.08208955223880597, "frac_reward_zero_std": 0.5, "grad_norm": 0.19338767230510712, "kl": 0.0, "learning_rate": 4.670462387853692e-07, "loss": -0.0023, "num_tokens": 8192258.0, "reward": 0.7333333492279053, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 242 }, { "completion_length": 1042.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 2626.0, "completions/max_terminated_length": 2626.0, "completions/mean_length": 1042.75, "completions/mean_terminated_length": 1042.75, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 0.08242876526458616, "frac_reward_zero_std": 0.5, "grad_norm": 0.4441406726837158, "kl": 0.0, "learning_rate": 4.6687370600414077e-07, "loss": -0.0016, "num_tokens": 8216441.0, "reward": 1.1000001430511475, "reward_std": 0.20000001788139343, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.2696799635887146, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 243 }, { "completion_length": 1917.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5127.0, "completions/max_terminated_length": 5127.0, "completions/mean_length": 1917.916748046875, "completions/mean_terminated_length": 1917.916748046875, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 0.08276797829036635, "frac_reward_zero_std": 1.0, "grad_norm": 1.564266796094671e-07, "kl": 0.0, "learning_rate": 4.667011732229124e-07, "loss": 0.0, "num_tokens": 8256160.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 244 }, { "completion_length": 1719.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2907.0, "completions/max_terminated_length": 2907.0, "completions/mean_length": 1719.0833740234375, "completions/mean_terminated_length": 1719.0833740234375, "completions/min_length": 848.0, "completions/min_terminated_length": 848.0, "epoch": 0.08310719131614654, "frac_reward_zero_std": 0.0, "grad_norm": 0.6577960848808289, "kl": 0.0, "learning_rate": 4.665286404416839e-07, "loss": -0.0004, "num_tokens": 8285387.0, "reward": 1.0374999046325684, "reward_std": 0.3044798970222473, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.45226702094078064, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 245 }, { "completion_length": 2581.7501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4833.0, "completions/max_terminated_length": 4833.0, "completions/mean_length": 2581.75, "completions/mean_terminated_length": 2581.75, "completions/min_length": 904.0, "completions/min_terminated_length": 904.0, "epoch": 0.08344640434192672, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.663561076604555e-07, "loss": 0.0, "num_tokens": 8325866.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 246 }, { "completion_length": 1798.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3059.0, "completions/max_terminated_length": 3059.0, "completions/mean_length": 1798.5, "completions/mean_terminated_length": 1798.5, "completions/min_length": 750.0, "completions/min_terminated_length": 750.0, "epoch": 0.08378561736770691, "frac_reward_zero_std": 1.0, "grad_norm": 3.0002641437931743e-07, "kl": 0.0, "learning_rate": 4.6618357487922703e-07, "loss": 0.0, "num_tokens": 8360000.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 247 }, { "completion_length": 1804.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3874.0, "completions/max_terminated_length": 3874.0, "completions/mean_length": 1804.666748046875, "completions/mean_terminated_length": 1804.666748046875, "completions/min_length": 550.0, "completions/min_terminated_length": 550.0, "epoch": 0.08412483039348712, "frac_reward_zero_std": 0.5, "grad_norm": 0.7278051376342773, "kl": 0.0, "learning_rate": 4.660110420979986e-07, "loss": -0.0089, "num_tokens": 8396068.0, "reward": 0.6333333253860474, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 248 }, { "completion_length": 1632.5000610351562, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6421.0, "completions/mean_length": 2730.666748046875, "completions/mean_terminated_length": 1959.0, "completions/min_length": 531.0, "completions/min_terminated_length": 531.0, "epoch": 0.0844640434192673, "frac_reward_zero_std": 0.5, "grad_norm": 2.0582363605499268, "kl": NaN, "learning_rate": 4.6583850931677014e-07, "loss": -0.0822, "num_tokens": 8426572.0, "reward": 0.9833334684371948, "reward_std": 0.3356585204601288, "rewards/correctness_reward_func/mean": 0.7333332896232605, "rewards/correctness_reward_func/std": 0.35505014657974243, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 249 }, { "completion_length": 936.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2077.0, "completions/max_terminated_length": 2077.0, "completions/mean_length": 936.4166870117188, "completions/mean_terminated_length": 936.4166870117188, "completions/min_length": 527.0, "completions/min_terminated_length": 527.0, "epoch": 0.08480325644504749, "frac_reward_zero_std": 0.0, "grad_norm": 0.09999001771211624, "kl": 0.0, "learning_rate": 4.6566597653554174e-07, "loss": -0.0015, "num_tokens": 8449605.0, "reward": 1.0916666984558105, "reward_std": 0.07955464720726013, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 250 }, { "completion_length": 1101.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2416.0, "completions/max_terminated_length": 2416.0, "completions/mean_length": 1101.3333740234375, "completions/mean_terminated_length": 1101.3333740234375, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 0.08514246947082768, "frac_reward_zero_std": 0.5, "grad_norm": 0.057271238416433334, "kl": 0.0, "learning_rate": 4.654934437543133e-07, "loss": 0.0001, "num_tokens": 8478667.0, "reward": 1.0750000476837158, "reward_std": 0.038729824125766754, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 251 }, { "completion_length": 2234.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3281.0, "completions/max_terminated_length": 3281.0, "completions/mean_length": 2234.416748046875, "completions/mean_terminated_length": 2234.416748046875, "completions/min_length": 849.0, "completions/min_terminated_length": 849.0, "epoch": 0.08548168249660787, "frac_reward_zero_std": 1.0, "grad_norm": 3.4393929126963485e-07, "kl": 0.0, "learning_rate": 4.6532091097308485e-07, "loss": 0.0, "num_tokens": 8519382.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 252 }, { "completion_length": 2895.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5231.0, "completions/max_terminated_length": 5231.0, "completions/mean_length": 2895.83349609375, "completions/mean_terminated_length": 2895.83349609375, "completions/min_length": 1011.0, "completions/min_terminated_length": 1011.0, "epoch": 0.08582089552238806, "frac_reward_zero_std": 0.0, "grad_norm": 0.6422632336616516, "kl": 0.0, "learning_rate": 4.651483781918564e-07, "loss": -0.0129, "num_tokens": 8563696.0, "reward": 0.9500000476837158, "reward_std": 0.2917786240577698, "rewards/correctness_reward_func/mean": 0.6500000357627869, "rewards/correctness_reward_func/std": 0.40113475918769836, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 253 }, { "completion_length": 2033.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3983.0, "completions/max_terminated_length": 3983.0, "completions/mean_length": 2033.416748046875, "completions/mean_terminated_length": 2033.416748046875, "completions/min_length": 619.0, "completions/min_terminated_length": 619.0, "epoch": 0.08616010854816825, "frac_reward_zero_std": 0.5, "grad_norm": 0.05713088810443878, "kl": 0.0, "learning_rate": 4.64975845410628e-07, "loss": -0.0011, "num_tokens": 8598651.0, "reward": 0.6875001788139343, "reward_std": 0.03061862289905548, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 254 }, { "completion_length": 1276.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2436.0, "completions/max_terminated_length": 2436.0, "completions/mean_length": 1276.0, "completions/mean_terminated_length": 1276.0, "completions/min_length": 727.0, "completions/min_terminated_length": 727.0, "epoch": 0.08649932157394843, "frac_reward_zero_std": 1.0, "grad_norm": 3.0110948046058184e-07, "kl": 0.0, "learning_rate": 4.648033126293996e-07, "loss": 0.0, "num_tokens": 8624469.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 255 }, { "completion_length": 2234.0833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4899.0, "completions/mean_length": 2783.166748046875, "completions/mean_terminated_length": 2437.181884765625, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 0.08683853459972862, "frac_reward_zero_std": 0.0, "grad_norm": 0.11855963617563248, "kl": NaN, "learning_rate": 4.646307798481711e-07, "loss": -0.0076, "num_tokens": 8665972.0, "reward": 0.7749999761581421, "reward_std": 0.06123725324869156, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 256 }, { "completion_length": 2740.2501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 5840.0, "completions/max_terminated_length": 5840.0, "completions/mean_length": 2740.25, "completions/mean_terminated_length": 2740.25, "completions/min_length": 1029.0, "completions/min_terminated_length": 1029.0, "epoch": 0.08717774762550883, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.644582470669427e-07, "loss": 0.0, "num_tokens": 8714371.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 257 }, { "completion_length": 3532.8333740234375, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 6254.0, "completions/mean_length": 5180.08349609375, "completions/mean_terminated_length": 4710.4443359375, "completions/min_length": 3218.0, "completions/min_terminated_length": 3218.0, "epoch": 0.08751696065128901, "frac_reward_zero_std": 0.0, "grad_norm": 0.22339831292629242, "kl": NaN, "learning_rate": 4.6428571428571427e-07, "loss": -0.016, "num_tokens": 8766629.0, "reward": 0.6750000715255737, "reward_std": 0.1218542754650116, "rewards/correctness_reward_func/mean": 0.44999995827674866, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.11965861916542053, "step": 258 }, { "completion_length": 1992.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3612.0, "completions/max_terminated_length": 3612.0, "completions/mean_length": 1992.666748046875, "completions/mean_terminated_length": 1992.666748046875, "completions/min_length": 869.0, "completions/min_terminated_length": 869.0, "epoch": 0.0878561736770692, "frac_reward_zero_std": 0.0, "grad_norm": 0.47433897852897644, "kl": 0.0, "learning_rate": 4.641131815044858e-07, "loss": 0.0026, "num_tokens": 8805757.0, "reward": 0.5541666746139526, "reward_std": 0.2371777594089508, "rewards/correctness_reward_func/mean": 0.2666666805744171, "rewards/correctness_reward_func/std": 0.393892765045166, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 259 }, { "completion_length": 2260.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5259.0, "completions/max_terminated_length": 5259.0, "completions/mean_length": 2260.416748046875, "completions/mean_terminated_length": 2260.416748046875, "completions/min_length": 1118.0, "completions/min_terminated_length": 1118.0, "epoch": 0.08819538670284939, "frac_reward_zero_std": 0.5, "grad_norm": 0.11118398606777191, "kl": 0.0, "learning_rate": 4.639406487232574e-07, "loss": 0.0031, "num_tokens": 8846226.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 260 }, { "completion_length": 2100.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 3065.0, "completions/max_terminated_length": 3065.0, "completions/mean_length": 2100.5, "completions/mean_terminated_length": 2100.5, "completions/min_length": 1150.0, "completions/min_terminated_length": 1150.0, "epoch": 0.08853459972862958, "frac_reward_zero_std": 0.5, "grad_norm": 0.14715582132339478, "kl": 0.0, "learning_rate": 4.63768115942029e-07, "loss": 0.0013, "num_tokens": 8881794.0, "reward": 1.1041667461395264, "reward_std": 0.05571504682302475, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 261 }, { "completion_length": 1970.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3088.0, "completions/max_terminated_length": 3088.0, "completions/mean_length": 1970.916748046875, "completions/mean_terminated_length": 1970.916748046875, "completions/min_length": 860.0, "completions/min_terminated_length": 860.0, "epoch": 0.08887381275440977, "frac_reward_zero_std": 1.0, "grad_norm": 2.0765658348409488e-07, "kl": 0.0, "learning_rate": 4.6359558316080054e-07, "loss": 0.0, "num_tokens": 8914019.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 262 }, { "completion_length": 1055.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2481.0, "completions/max_terminated_length": 2481.0, "completions/mean_length": 1055.75, "completions/mean_terminated_length": 1055.75, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 0.08921302578018996, "frac_reward_zero_std": 0.5, "grad_norm": 0.059661220759153366, "kl": 0.0, "learning_rate": 4.634230503795721e-07, "loss": -0.0, "num_tokens": 8938184.0, "reward": 0.6875001788139343, "reward_std": 0.03061862289905548, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 263 }, { "completion_length": 2159.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3540.0, "completions/max_terminated_length": 3540.0, "completions/mean_length": 2159.666748046875, "completions/mean_terminated_length": 2159.666748046875, "completions/min_length": 879.0, "completions/min_terminated_length": 879.0, "epoch": 0.08955223880597014, "frac_reward_zero_std": 1.0, "grad_norm": 1.261990263401458e-07, "kl": 0.0, "learning_rate": 4.6325051759834364e-07, "loss": 0.0, "num_tokens": 8976838.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 264 }, { "completion_length": 1059.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 2702.0, "completions/max_terminated_length": 2702.0, "completions/mean_length": 1059.0, "completions/mean_terminated_length": 1059.0, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.08989145183175033, "frac_reward_zero_std": 1.0, "grad_norm": 2.0293516911351617e-07, "kl": 0.0, "learning_rate": 4.6307798481711525e-07, "loss": 0.0, "num_tokens": 9005260.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 265 }, { "completion_length": 1051.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1806.0, "completions/max_terminated_length": 1806.0, "completions/mean_length": 1051.0, "completions/mean_terminated_length": 1051.0, "completions/min_length": 509.0, "completions/min_terminated_length": 509.0, "epoch": 0.09023066485753053, "frac_reward_zero_std": 0.5, "grad_norm": 0.35058966279029846, "kl": 0.0, "learning_rate": 4.629054520358868e-07, "loss": -0.0082, "num_tokens": 9026614.0, "reward": 0.7833334803581238, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.4833333194255829, "rewards/correctness_reward_func/std": 0.43029236793518066, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 266 }, { "completion_length": 1659.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3826.0, "completions/max_terminated_length": 3826.0, "completions/mean_length": 1659.25, "completions/mean_terminated_length": 1659.25, "completions/min_length": 695.0, "completions/min_terminated_length": 695.0, "epoch": 0.09056987788331072, "frac_reward_zero_std": 0.5, "grad_norm": 0.540415346622467, "kl": 0.0, "learning_rate": 4.6273291925465835e-07, "loss": 0.0041, "num_tokens": 9057775.0, "reward": 1.0833334922790527, "reward_std": 0.19407902657985687, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.26227444410324097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 267 }, { "completion_length": 2549.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3837.0, "completions/max_terminated_length": 3837.0, "completions/mean_length": 2549.916748046875, "completions/mean_terminated_length": 2549.916748046875, "completions/min_length": 1170.0, "completions/min_terminated_length": 1170.0, "epoch": 0.09090909090909091, "frac_reward_zero_std": 0.5, "grad_norm": 0.09202612936496735, "kl": 0.0, "learning_rate": 4.625603864734299e-07, "loss": -0.002, "num_tokens": 9100188.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 268 }, { "completion_length": 2686.0001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 6176.0, "completions/max_terminated_length": 6176.0, "completions/mean_length": 2686.0, "completions/mean_terminated_length": 2686.0, "completions/min_length": 1286.0, "completions/min_terminated_length": 1286.0, "epoch": 0.0912483039348711, "frac_reward_zero_std": 0.5, "grad_norm": 0.11565620452165604, "kl": 0.0, "learning_rate": 4.623878536922015e-07, "loss": 0.0012, "num_tokens": 9148038.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 269 }, { "completion_length": 1715.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4031.0, "completions/max_terminated_length": 4031.0, "completions/mean_length": 1715.5833740234375, "completions/mean_terminated_length": 1715.5833740234375, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 0.09158751696065129, "frac_reward_zero_std": 0.5, "grad_norm": 0.7160343527793884, "kl": 0.0, "learning_rate": 4.6221532091097307e-07, "loss": -0.0202, "num_tokens": 9179917.0, "reward": 0.4333333671092987, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.13333334028720856, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 270 }, { "completion_length": 2181.0, "completions/clipped_ratio": 0.41666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6022.0, "completions/mean_length": 4926.4169921875, "completions/mean_terminated_length": 3738.857421875, "completions/min_length": 1328.0, "completions/min_terminated_length": 1328.0, "epoch": 0.09192672998643148, "frac_reward_zero_std": 0.5, "grad_norm": 0.20487481355667114, "kl": NaN, "learning_rate": 4.620427881297446e-07, "loss": -0.019, "num_tokens": 9219079.0, "reward": 0.17500001192092896, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.17500001192092896, "rewards/format_reward_func/std": 0.15447859466075897, "step": 271 }, { "completion_length": 2217.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3837.0, "completions/max_terminated_length": 3837.0, "completions/mean_length": 2217.916748046875, "completions/mean_terminated_length": 2217.916748046875, "completions/min_length": 841.0, "completions/min_terminated_length": 841.0, "epoch": 0.09226594301221167, "frac_reward_zero_std": 0.5, "grad_norm": 0.6833940744400024, "kl": 0.0, "learning_rate": 4.618702553485162e-07, "loss": 0.0011, "num_tokens": 9260148.0, "reward": 0.6000000238418579, "reward_std": 0.23664319515228271, "rewards/correctness_reward_func/mean": 0.29999998211860657, "rewards/correctness_reward_func/std": 0.4472135901451111, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 272 }, { "completion_length": 3103.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4943.0, "completions/max_terminated_length": 4943.0, "completions/mean_length": 3103.5, "completions/mean_terminated_length": 3103.5, "completions/min_length": 2078.0, "completions/min_terminated_length": 2078.0, "epoch": 0.09260515603799185, "frac_reward_zero_std": 0.0, "grad_norm": 0.4680456817150116, "kl": 0.0, "learning_rate": 4.616977225672878e-07, "loss": -0.0159, "num_tokens": 9307752.0, "reward": 1.070833444595337, "reward_std": 0.2576434314250946, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.26227444410324097, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 273 }, { "completion_length": 1276.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3375.0, "completions/max_terminated_length": 3375.0, "completions/mean_length": 1276.75, "completions/mean_terminated_length": 1276.75, "completions/min_length": 541.0, "completions/min_terminated_length": 541.0, "epoch": 0.09294436906377204, "frac_reward_zero_std": 1.0, "grad_norm": 2.443483992919937e-07, "kl": 0.0, "learning_rate": 4.6152518978605933e-07, "loss": 0.0, "num_tokens": 9331791.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 274 }, { "completion_length": 2395.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4769.0, "completions/max_terminated_length": 4769.0, "completions/mean_length": 2395.916748046875, "completions/mean_terminated_length": 2395.916748046875, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "epoch": 0.09328358208955224, "frac_reward_zero_std": 0.0, "grad_norm": 0.8109861612319946, "kl": 0.0, "learning_rate": 4.613526570048309e-07, "loss": 0.0024, "num_tokens": 9368888.0, "reward": 0.6375000476837158, "reward_std": 0.4031320810317993, "rewards/correctness_reward_func/mean": 0.3499999940395355, "rewards/correctness_reward_func/std": 0.4358898997306824, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 275 }, { "completion_length": 2529.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5854.0, "completions/max_terminated_length": 5854.0, "completions/mean_length": 2529.916748046875, "completions/mean_terminated_length": 2529.916748046875, "completions/min_length": 1131.0, "completions/min_terminated_length": 1131.0, "epoch": 0.09362279511533243, "frac_reward_zero_std": 0.0, "grad_norm": 0.44502562284469604, "kl": 0.0, "learning_rate": 4.611801242236025e-07, "loss": 0.0014, "num_tokens": 9407575.0, "reward": 1.0499999523162842, "reward_std": 0.24738392233848572, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.36306774616241455, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 276 }, { "completion_length": 2848.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4154.0, "completions/max_terminated_length": 4154.0, "completions/mean_length": 2848.33349609375, "completions/mean_terminated_length": 2848.33349609375, "completions/min_length": 1579.0, "completions/min_terminated_length": 1579.0, "epoch": 0.09396200814111262, "frac_reward_zero_std": 0.0, "grad_norm": 0.13393691182136536, "kl": 0.0, "learning_rate": 4.6100759144237404e-07, "loss": -0.0004, "num_tokens": 9455825.0, "reward": 0.2625000476837158, "reward_std": 0.06934845447540283, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 277 }, { "completion_length": 3776.666748046875, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 6433.0, "completions/mean_length": 5423.9169921875, "completions/mean_terminated_length": 5035.5556640625, "completions/min_length": 3525.0, "completions/min_terminated_length": 3525.0, "epoch": 0.09430122116689281, "frac_reward_zero_std": 0.0, "grad_norm": 0.8137674927711487, "kl": NaN, "learning_rate": 4.608350586611456e-07, "loss": -0.0881, "num_tokens": 9508333.0, "reward": 0.720833420753479, "reward_std": 0.4936787486076355, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.43029236793518066, "rewards/format_reward_func/mean": 0.23750001192092896, "rewards/format_reward_func/std": 0.11894422769546509, "step": 278 }, { "completion_length": 2009.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5969.0, "completions/max_terminated_length": 5969.0, "completions/mean_length": 2009.3333740234375, "completions/mean_terminated_length": 2009.3333740234375, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.094640434192673, "frac_reward_zero_std": 0.5, "grad_norm": 0.8695447444915771, "kl": 0.0, "learning_rate": 4.6066252587991715e-07, "loss": 0.0393, "num_tokens": 9539867.0, "reward": 0.5, "reward_std": 0.21908903121948242, "rewards/correctness_reward_func/mean": 0.20000000298023224, "rewards/correctness_reward_func/std": 0.36181360483169556, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 279 }, { "completion_length": 1017.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1355.0, "completions/max_terminated_length": 1355.0, "completions/mean_length": 1017.5, "completions/mean_terminated_length": 1017.5, "completions/min_length": 628.0, "completions/min_terminated_length": 628.0, "epoch": 0.09497964721845319, "frac_reward_zero_std": 0.5, "grad_norm": 0.05056990310549736, "kl": 0.0, "learning_rate": 4.6048999309868875e-07, "loss": -0.001, "num_tokens": 9560039.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 280 }, { "completion_length": 2585.0001220703125, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6297.0, "completions/mean_length": 3683.166748046875, "completions/mean_terminated_length": 3102.0, "completions/min_length": 592.0, "completions/min_terminated_length": 592.0, "epoch": 0.09531886024423337, "frac_reward_zero_std": 0.5, "grad_norm": 0.1040610745549202, "kl": NaN, "learning_rate": 4.6031746031746025e-07, "loss": -0.0198, "num_tokens": 9600029.0, "reward": 0.6625000834465027, "reward_std": 0.06274950504302979, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 281 }, { "completion_length": 2685.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5926.0, "completions/max_terminated_length": 5926.0, "completions/mean_length": 2685.83349609375, "completions/mean_terminated_length": 2685.83349609375, "completions/min_length": 1012.0, "completions/min_terminated_length": 1012.0, "epoch": 0.09565807327001356, "frac_reward_zero_std": 0.0, "grad_norm": 0.6141899824142456, "kl": 0.0, "learning_rate": 4.6014492753623186e-07, "loss": -0.0061, "num_tokens": 9644097.0, "reward": 0.8875000476837158, "reward_std": 0.2698235511779785, "rewards/correctness_reward_func/mean": 0.5999999642372131, "rewards/correctness_reward_func/std": 0.45126086473464966, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 282 }, { "completion_length": 2492.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 6501.0, "completions/max_terminated_length": 6501.0, "completions/mean_length": 2492.58349609375, "completions/mean_terminated_length": 2492.58349609375, "completions/min_length": 702.0, "completions/min_terminated_length": 702.0, "epoch": 0.09599728629579375, "frac_reward_zero_std": 0.5, "grad_norm": 0.10037942975759506, "kl": 0.0, "learning_rate": 4.599723947550034e-07, "loss": -0.0009, "num_tokens": 9686560.0, "reward": 1.1666667461395264, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 283 }, { "completion_length": 1550.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3066.0, "completions/max_terminated_length": 3066.0, "completions/mean_length": 1550.916748046875, "completions/mean_terminated_length": 1550.916748046875, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 0.09633649932157395, "frac_reward_zero_std": 0.5, "grad_norm": 0.05274030938744545, "kl": 0.0, "learning_rate": 4.59799861973775e-07, "loss": 0.0002, "num_tokens": 9716757.0, "reward": 1.1875, "reward_std": 0.030618613585829735, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 284 }, { "completion_length": 2718.08349609375, "completions/clipped_ratio": 0.0, "completions/max_length": 4146.0, "completions/max_terminated_length": 4146.0, "completions/mean_length": 2718.08349609375, "completions/mean_terminated_length": 2718.08349609375, "completions/min_length": 1100.0, "completions/min_terminated_length": 1100.0, "epoch": 0.09667571234735414, "frac_reward_zero_std": 0.5, "grad_norm": 0.087214395403862, "kl": 0.0, "learning_rate": 4.596273291925465e-07, "loss": -0.002, "num_tokens": 9761284.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 285 }, { "completion_length": 1447.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2123.0, "completions/max_terminated_length": 2123.0, "completions/mean_length": 1447.25, "completions/mean_terminated_length": 1447.25, "completions/min_length": 954.0, "completions/min_terminated_length": 954.0, "epoch": 0.09701492537313433, "frac_reward_zero_std": 1.0, "grad_norm": 3.683128113607381e-07, "kl": 0.0, "learning_rate": 4.594547964113181e-07, "loss": 0.0, "num_tokens": 9792013.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 286 }, { "completion_length": 1051.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2202.0, "completions/max_terminated_length": 2202.0, "completions/mean_length": 1051.0833740234375, "completions/mean_terminated_length": 1051.0833740234375, "completions/min_length": 503.0, "completions/min_terminated_length": 503.0, "epoch": 0.09735413839891452, "frac_reward_zero_std": 1.0, "grad_norm": 1.0335989486520702e-07, "kl": 0.0, "learning_rate": 4.5928226363008973e-07, "loss": 0.0, "num_tokens": 9817568.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 287 }, { "completion_length": 2563.8333740234375, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 5924.0, "completions/mean_length": 4211.08349609375, "completions/mean_terminated_length": 3418.444580078125, "completions/min_length": 1609.0, "completions/min_terminated_length": 1609.0, "epoch": 0.09769335142469471, "frac_reward_zero_std": 0.5, "grad_norm": 0.8279024958610535, "kl": NaN, "learning_rate": 4.591097308488613e-07, "loss": -0.0875, "num_tokens": 9862596.0, "reward": 0.8041667342185974, "reward_std": 0.3116154074668884, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.42497774958610535, "rewards/format_reward_func/mean": 0.23750001192092896, "rewards/format_reward_func/std": 0.11894422769546509, "step": 288 }, { "completion_length": 3452.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5503.0, "completions/max_terminated_length": 5503.0, "completions/mean_length": 3452.83349609375, "completions/mean_terminated_length": 3452.83349609375, "completions/min_length": 2014.0, "completions/min_terminated_length": 2014.0, "epoch": 0.0980325644504749, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.5893719806763283e-07, "loss": 0.0, "num_tokens": 9915352.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 289 }, { "completion_length": 1157.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1837.0, "completions/max_terminated_length": 1837.0, "completions/mean_length": 1157.416748046875, "completions/mean_terminated_length": 1157.416748046875, "completions/min_length": 640.0, "completions/min_terminated_length": 640.0, "epoch": 0.09837177747625508, "frac_reward_zero_std": 0.5, "grad_norm": 0.06177419424057007, "kl": 0.0, "learning_rate": 4.587646652864044e-07, "loss": 0.0001, "num_tokens": 9939849.0, "reward": 1.0875000953674316, "reward_std": 0.030618613585829735, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 290 }, { "completion_length": 595.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1038.0, "completions/max_terminated_length": 1038.0, "completions/mean_length": 595.25, "completions/mean_terminated_length": 595.25, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 0.09871099050203527, "frac_reward_zero_std": 0.0, "grad_norm": 0.08985080569982529, "kl": 0.0, "learning_rate": 4.58592132505176e-07, "loss": 0.0007, "num_tokens": 9958920.0, "reward": 0.7583333849906921, "reward_std": 0.07955463975667953, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 291 }, { "completion_length": 2860.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5107.0, "completions/max_terminated_length": 5107.0, "completions/mean_length": 2860.666748046875, "completions/mean_terminated_length": 2860.666748046875, "completions/min_length": 847.0, "completions/min_terminated_length": 847.0, "epoch": 0.09905020352781546, "frac_reward_zero_std": 0.0, "grad_norm": 0.6327357292175293, "kl": 0.0, "learning_rate": 4.584195997239475e-07, "loss": -0.0131, "num_tokens": 9998972.0, "reward": 0.7666667699813843, "reward_std": 0.36985844373703003, "rewards/correctness_reward_func/mean": 0.46666669845581055, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 292 }, { "completion_length": 2198.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4135.0, "completions/max_terminated_length": 4135.0, "completions/mean_length": 2198.83349609375, "completions/mean_terminated_length": 2198.83349609375, "completions/min_length": 1079.0, "completions/min_terminated_length": 1079.0, "epoch": 0.09938941655359566, "frac_reward_zero_std": 0.0, "grad_norm": 0.14555297791957855, "kl": 0.0, "learning_rate": 4.582470669427191e-07, "loss": -0.004, "num_tokens": 10040952.0, "reward": 1.1666667461395264, "reward_std": 0.09559705853462219, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 293 }, { "completion_length": 1870.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5196.0, "completions/max_terminated_length": 5196.0, "completions/mean_length": 1870.0833740234375, "completions/mean_terminated_length": 1870.0833740234375, "completions/min_length": 573.0, "completions/min_terminated_length": 573.0, "epoch": 0.09972862957937585, "frac_reward_zero_std": 0.5, "grad_norm": 0.6394572257995605, "kl": 0.0, "learning_rate": 4.5807453416149065e-07, "loss": 0.0048, "num_tokens": 10071475.0, "reward": 0.7833334803581238, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.4833333194255829, "rewards/correctness_reward_func/std": 0.43029236793518066, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 294 }, { "completion_length": 2610.9166870117188, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5046.0, "completions/mean_length": 3709.08349609375, "completions/mean_terminated_length": 3133.10009765625, "completions/min_length": 1825.0, "completions/min_terminated_length": 1825.0, "epoch": 0.10006784260515604, "frac_reward_zero_std": 0.0, "grad_norm": 0.9850728511810303, "kl": NaN, "learning_rate": 4.5790200138026226e-07, "loss": -0.0765, "num_tokens": 10118868.0, "reward": 0.8333333730697632, "reward_std": 0.5316232442855835, "rewards/correctness_reward_func/mean": 0.5833333134651184, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 295 }, { "completion_length": 3167.666748046875, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 6466.0, "completions/mean_length": 4814.9169921875, "completions/mean_terminated_length": 4223.5556640625, "completions/min_length": 1487.0, "completions/min_terminated_length": 1487.0, "epoch": 0.10040705563093623, "frac_reward_zero_std": 0.5, "grad_norm": 0.3844717741012573, "kl": NaN, "learning_rate": 4.5772946859903376e-07, "loss": -0.0294, "num_tokens": 10169294.0, "reward": 0.6375000476837158, "reward_std": 0.07373940199613571, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.23750001192092896, "rewards/format_reward_func/std": 0.11894422769546509, "step": 296 }, { "completion_length": 2613.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4815.0, "completions/max_terminated_length": 4815.0, "completions/mean_length": 2613.0, "completions/mean_terminated_length": 2613.0, "completions/min_length": 1343.0, "completions/min_terminated_length": 1343.0, "epoch": 0.10074626865671642, "frac_reward_zero_std": 0.5, "grad_norm": 0.12785717844963074, "kl": 0.0, "learning_rate": 4.5755693581780536e-07, "loss": 0.0003, "num_tokens": 10213808.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 297 }, { "completion_length": 1011.9167175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 1775.0, "completions/max_terminated_length": 1775.0, "completions/mean_length": 1011.9166870117188, "completions/mean_terminated_length": 1011.9166870117188, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 0.1010854816824966, "frac_reward_zero_std": 0.5, "grad_norm": 0.48920974135398865, "kl": 0.0, "learning_rate": 4.573844030365769e-07, "loss": -0.0006, "num_tokens": 10236667.0, "reward": 1.2041666507720947, "reward_std": 0.2002602517604828, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 298 }, { "completion_length": 3067.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 6274.0, "completions/max_terminated_length": 6274.0, "completions/mean_length": 3067.5, "completions/mean_terminated_length": 3067.5, "completions/min_length": 1115.0, "completions/min_terminated_length": 1115.0, "epoch": 0.1014246947082768, "frac_reward_zero_std": 0.5, "grad_norm": 0.07078174501657486, "kl": 0.0, "learning_rate": 4.572118702553485e-07, "loss": -0.0003, "num_tokens": 10285699.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 299 }, { "completion_length": 2652.2501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4168.0, "completions/max_terminated_length": 4168.0, "completions/mean_length": 2652.25, "completions/mean_terminated_length": 2652.25, "completions/min_length": 1339.0, "completions/min_terminated_length": 1339.0, "epoch": 0.10176390773405698, "frac_reward_zero_std": 0.5, "grad_norm": 0.06933386623859406, "kl": 0.0, "learning_rate": 4.5703933747412e-07, "loss": -0.0005, "num_tokens": 10330456.0, "reward": 0.6750000715255737, "reward_std": 0.03872981667518616, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 300 }, { "completion_length": 489.00001525878906, "completions/clipped_ratio": 0.5, "completions/max_length": 6589.0, "completions/max_terminated_length": 1186.0, "completions/mean_length": 3783.5, "completions/mean_terminated_length": 978.0, "completions/min_length": 566.0, "completions/min_terminated_length": 566.0, "epoch": 0.10210312075983717, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": NaN, "learning_rate": 4.568668046928916e-07, "loss": 0.0, "num_tokens": 10348420.0, "reward": 0.15000000596046448, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.15000000596046448, "rewards/format_reward_func/std": 0.15666989982128143, "step": 301 }, { "completion_length": 3680.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5165.0, "completions/max_terminated_length": 5165.0, "completions/mean_length": 3680.33349609375, "completions/mean_terminated_length": 3680.33349609375, "completions/min_length": 1925.0, "completions/min_terminated_length": 1925.0, "epoch": 0.10244233378561737, "frac_reward_zero_std": 1.0, "grad_norm": 4.457709223970596e-07, "kl": 0.0, "learning_rate": 4.5669427191166323e-07, "loss": 0.0, "num_tokens": 10402940.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 302 }, { "completion_length": 1168.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2076.0, "completions/max_terminated_length": 2076.0, "completions/mean_length": 1168.0, "completions/mean_terminated_length": 1168.0, "completions/min_length": 620.0, "completions/min_terminated_length": 620.0, "epoch": 0.10278154681139756, "frac_reward_zero_std": 0.0, "grad_norm": 0.5187942981719971, "kl": 0.0, "learning_rate": 4.5652173913043473e-07, "loss": -0.0028, "num_tokens": 10434620.0, "reward": 1.0375001430511475, "reward_std": 0.2883797585964203, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.36306774616241455, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 303 }, { "completion_length": 1859.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3710.0, "completions/max_terminated_length": 3710.0, "completions/mean_length": 1859.916748046875, "completions/mean_terminated_length": 1859.916748046875, "completions/min_length": 541.0, "completions/min_terminated_length": 541.0, "epoch": 0.10312075983717775, "frac_reward_zero_std": 0.5, "grad_norm": 0.17270970344543457, "kl": 0.0, "learning_rate": 4.5634920634920634e-07, "loss": -0.0001, "num_tokens": 10468945.0, "reward": 1.1666667461395264, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 304 }, { "completion_length": 2067.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3797.0, "completions/max_terminated_length": 3797.0, "completions/mean_length": 2067.166748046875, "completions/mean_terminated_length": 2067.166748046875, "completions/min_length": 720.0, "completions/min_terminated_length": 720.0, "epoch": 0.10345997286295794, "frac_reward_zero_std": 1.0, "grad_norm": 2.4897084927033575e-07, "kl": 0.0, "learning_rate": 4.561766735679779e-07, "loss": 0.0, "num_tokens": 10505637.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 305 }, { "completion_length": 769.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1071.0, "completions/max_terminated_length": 1071.0, "completions/mean_length": 769.25, "completions/mean_terminated_length": 769.25, "completions/min_length": 633.0, "completions/min_terminated_length": 633.0, "epoch": 0.10379918588873813, "frac_reward_zero_std": 1.0, "grad_norm": 9.078951990204587e-08, "kl": 0.0, "learning_rate": 4.560041407867495e-07, "loss": 0.0, "num_tokens": 10528476.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 306 }, { "completion_length": 2807.5833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4502.0, "completions/mean_length": 3356.666748046875, "completions/mean_terminated_length": 3062.818359375, "completions/min_length": 1536.0, "completions/min_terminated_length": 1536.0, "epoch": 0.10413839891451832, "frac_reward_zero_std": 0.5, "grad_norm": 0.09662654250860214, "kl": NaN, "learning_rate": 4.55831608005521e-07, "loss": -0.0059, "num_tokens": 10573567.0, "reward": 0.6875001788139343, "reward_std": 0.03061862289905548, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 307 }, { "completion_length": 971.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1409.0, "completions/max_terminated_length": 1409.0, "completions/mean_length": 971.25, "completions/mean_terminated_length": 971.25, "completions/min_length": 642.0, "completions/min_terminated_length": 642.0, "epoch": 0.1044776119402985, "frac_reward_zero_std": 0.5, "grad_norm": 0.07163669914007187, "kl": 0.0, "learning_rate": 4.556590752242926e-07, "loss": 0.0004, "num_tokens": 10592332.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 308 }, { "completion_length": 3025.5833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5822.0, "completions/mean_length": 3574.666748046875, "completions/mean_terminated_length": 3300.636474609375, "completions/min_length": 919.0, "completions/min_terminated_length": 919.0, "epoch": 0.10481682496607869, "frac_reward_zero_std": 0.5, "grad_norm": 0.5091751217842102, "kl": NaN, "learning_rate": 4.5548654244306415e-07, "loss": -0.0043, "num_tokens": 10640033.0, "reward": 0.770833432674408, "reward_std": 0.2123773992061615, "rewards/correctness_reward_func/mean": 0.4833333194255829, "rewards/correctness_reward_func/std": 0.43029236793518066, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 309 }, { "completion_length": 1609.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2961.0, "completions/max_terminated_length": 2961.0, "completions/mean_length": 1609.916748046875, "completions/mean_terminated_length": 1609.916748046875, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "epoch": 0.10515603799185888, "frac_reward_zero_std": 0.5, "grad_norm": 0.07517128437757492, "kl": 0.0, "learning_rate": 4.5531400966183576e-07, "loss": -0.0007, "num_tokens": 10670926.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 310 }, { "completion_length": 3589.7501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4867.0, "completions/max_terminated_length": 4867.0, "completions/mean_length": 3589.75, "completions/mean_terminated_length": 3589.75, "completions/min_length": 2861.0, "completions/min_terminated_length": 2861.0, "epoch": 0.10549525101763908, "frac_reward_zero_std": 0.0, "grad_norm": 0.1443360447883606, "kl": 0.0, "learning_rate": 4.5514147688060726e-07, "loss": -0.0008, "num_tokens": 10727071.0, "reward": 0.7041667699813843, "reward_std": 0.07144345343112946, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 311 }, { "completion_length": 1537.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4027.0, "completions/max_terminated_length": 4027.0, "completions/mean_length": 1537.3333740234375, "completions/mean_terminated_length": 1537.3333740234375, "completions/min_length": 811.0, "completions/min_terminated_length": 811.0, "epoch": 0.10583446404341927, "frac_reward_zero_std": 0.0, "grad_norm": 0.42037126421928406, "kl": 0.0, "learning_rate": 4.5496894409937887e-07, "loss": 0.0139, "num_tokens": 10761491.0, "reward": 1.1666667461395264, "reward_std": 0.2588964104652405, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.287096232175827, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 312 }, { "completion_length": 2531.7501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4342.0, "completions/max_terminated_length": 4342.0, "completions/mean_length": 2531.75, "completions/mean_terminated_length": 2531.75, "completions/min_length": 874.0, "completions/min_terminated_length": 874.0, "epoch": 0.10617367706919946, "frac_reward_zero_std": 1.0, "grad_norm": 1.324536924585118e-07, "kl": 0.0, "learning_rate": 4.547964113181504e-07, "loss": 0.0, "num_tokens": 10803350.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 313 }, { "completion_length": 3405.416748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6562.0, "completions/mean_length": 3954.5, "completions/mean_terminated_length": 3715.0, "completions/min_length": 1611.0, "completions/min_terminated_length": 1611.0, "epoch": 0.10651289009497965, "frac_reward_zero_std": 0.0, "grad_norm": 0.647424578666687, "kl": NaN, "learning_rate": 4.5462387853692197e-07, "loss": -0.0113, "num_tokens": 10858477.0, "reward": 0.6791666746139526, "reward_std": 0.2734726071357727, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.5149286985397339, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 314 }, { "completion_length": 2428.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5417.0, "completions/max_terminated_length": 5417.0, "completions/mean_length": 2428.166748046875, "completions/mean_terminated_length": 2428.166748046875, "completions/min_length": 1073.0, "completions/min_terminated_length": 1073.0, "epoch": 0.10685210312075984, "frac_reward_zero_std": 0.0, "grad_norm": 0.5885283350944519, "kl": 0.0, "learning_rate": 4.544513457556935e-07, "loss": 0.023, "num_tokens": 10903773.0, "reward": 1.1166667938232422, "reward_std": 0.24571877717971802, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.27579089999198914, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 315 }, { "completion_length": 3376.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6365.0, "completions/max_terminated_length": 6365.0, "completions/mean_length": 3376.166748046875, "completions/mean_terminated_length": 3376.166748046875, "completions/min_length": 1547.0, "completions/min_terminated_length": 1547.0, "epoch": 0.10719131614654002, "frac_reward_zero_std": 0.5, "grad_norm": 0.15940889716148376, "kl": 0.0, "learning_rate": 4.5427881297446513e-07, "loss": -0.0027, "num_tokens": 10957691.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 316 }, { "completion_length": 1784.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4751.0, "completions/max_terminated_length": 4751.0, "completions/mean_length": 1784.8333740234375, "completions/mean_terminated_length": 1784.8333740234375, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 0.10753052917232021, "frac_reward_zero_std": 0.5, "grad_norm": 0.41797852516174316, "kl": 0.0, "learning_rate": 4.5410628019323674e-07, "loss": 0.0045, "num_tokens": 10988949.0, "reward": 1.0333335399627686, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 317 }, { "completion_length": 1927.2501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 3727.0, "completions/max_terminated_length": 3727.0, "completions/mean_length": 1927.25, "completions/mean_terminated_length": 1927.25, "completions/min_length": 823.0, "completions/min_terminated_length": 823.0, "epoch": 0.1078697421981004, "frac_reward_zero_std": 0.5, "grad_norm": 0.07680307328701019, "kl": 0.0, "learning_rate": 4.5393374741200824e-07, "loss": -0.0008, "num_tokens": 11024550.0, "reward": 1.1375000476837158, "reward_std": 0.030618613585829735, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.23750001192092896, "rewards/format_reward_func/std": 0.07723929733037949, "step": 318 }, { "completion_length": 954.4166717529297, "completions/clipped_ratio": 0.41666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5713.0, "completions/mean_length": 3699.83349609375, "completions/mean_terminated_length": 1636.1429443359375, "completions/min_length": 681.0, "completions/min_terminated_length": 681.0, "epoch": 0.10820895522388059, "frac_reward_zero_std": 0.0, "grad_norm": 0.2716827988624573, "kl": NaN, "learning_rate": 4.5376121463077984e-07, "loss": -0.0151, "num_tokens": 11048717.0, "reward": 0.6833333373069763, "reward_std": 0.10206204652786255, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.133143812417984, "step": 319 }, { "completion_length": 2118.5834350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 6177.0, "completions/max_terminated_length": 6177.0, "completions/mean_length": 2118.58349609375, "completions/mean_terminated_length": 2118.58349609375, "completions/min_length": 778.0, "completions/min_terminated_length": 778.0, "epoch": 0.10854816824966079, "frac_reward_zero_std": 0.0, "grad_norm": 0.16588005423545837, "kl": 0.0, "learning_rate": 4.535886818495514e-07, "loss": 0.0046, "num_tokens": 11085858.0, "reward": 1.1666667461395264, "reward_std": 0.09559707343578339, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 320 }, { "completion_length": 3266.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4822.0, "completions/max_terminated_length": 4822.0, "completions/mean_length": 3266.5, "completions/mean_terminated_length": 3266.5, "completions/min_length": 2462.0, "completions/min_terminated_length": 2462.0, "epoch": 0.10888738127544098, "frac_reward_zero_std": 0.5, "grad_norm": 0.137014701962471, "kl": 0.0, "learning_rate": 4.53416149068323e-07, "loss": 0.0032, "num_tokens": 11136462.0, "reward": 0.75, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.44999995827674866, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 321 }, { "completion_length": 2219.0000610351562, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5317.0, "completions/mean_length": 2768.08349609375, "completions/mean_terminated_length": 2420.727294921875, "completions/min_length": 506.0, "completions/min_terminated_length": 506.0, "epoch": 0.10922659430122117, "frac_reward_zero_std": 0.5, "grad_norm": 1.0347485542297363, "kl": NaN, "learning_rate": 4.532436162870945e-07, "loss": -0.049, "num_tokens": 11172702.0, "reward": 0.9375001192092896, "reward_std": 0.28885549306869507, "rewards/correctness_reward_func/mean": 0.6499999761581421, "rewards/correctness_reward_func/std": 0.40113475918769836, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 322 }, { "completion_length": 1779.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3227.0, "completions/max_terminated_length": 3227.0, "completions/mean_length": 1779.75, "completions/mean_terminated_length": 1779.75, "completions/min_length": 648.0, "completions/min_terminated_length": 648.0, "epoch": 0.10956580732700136, "frac_reward_zero_std": 0.5, "grad_norm": 0.34098246693611145, "kl": 0.0, "learning_rate": 4.530710835058661e-07, "loss": -0.0022, "num_tokens": 11204241.0, "reward": 0.36666667461395264, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.06666667014360428, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 323 }, { "completion_length": 2422.7501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4466.0, "completions/max_terminated_length": 4466.0, "completions/mean_length": 2422.75, "completions/mean_terminated_length": 2422.75, "completions/min_length": 853.0, "completions/min_terminated_length": 853.0, "epoch": 0.10990502035278155, "frac_reward_zero_std": 0.5, "grad_norm": 0.12155548483133316, "kl": 0.0, "learning_rate": 4.5289855072463766e-07, "loss": 0.0057, "num_tokens": 11242524.0, "reward": 0.75, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.45000001788139343, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 324 }, { "completion_length": 1740.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2459.0, "completions/max_terminated_length": 2459.0, "completions/mean_length": 1740.25, "completions/mean_terminated_length": 1740.25, "completions/min_length": 1336.0, "completions/min_terminated_length": 1336.0, "epoch": 0.11024423337856173, "frac_reward_zero_std": 1.0, "grad_norm": 1.1897661522652925e-07, "kl": 0.0, "learning_rate": 4.527260179434092e-07, "loss": 0.0, "num_tokens": 11277657.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 325 }, { "completion_length": 1775.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 5753.0, "completions/max_terminated_length": 5753.0, "completions/mean_length": 1775.8333740234375, "completions/mean_terminated_length": 1775.8333740234375, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 0.11058344640434192, "frac_reward_zero_std": 0.0, "grad_norm": 0.16949614882469177, "kl": 0.0, "learning_rate": 4.5255348516218076e-07, "loss": 0.0069, "num_tokens": 11312557.0, "reward": 1.1875, "reward_std": 0.09653984010219574, "rewards/correctness_reward_func/mean": 0.9000000357627869, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 326 }, { "completion_length": 930.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2166.0, "completions/max_terminated_length": 2166.0, "completions/mean_length": 930.0833740234375, "completions/mean_terminated_length": 930.0833740234375, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 0.11092265943012211, "frac_reward_zero_std": 0.0, "grad_norm": 0.2900066673755646, "kl": 0.0, "learning_rate": 4.5238095238095237e-07, "loss": -0.0033, "num_tokens": 11334668.0, "reward": 1.0375001430511475, "reward_std": 0.23474279046058655, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.24308621883392334, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 327 }, { "completion_length": 2132.3333740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4965.0, "completions/mean_length": 2681.416748046875, "completions/mean_terminated_length": 2326.181884765625, "completions/min_length": 779.0, "completions/min_terminated_length": 779.0, "epoch": 0.1112618724559023, "frac_reward_zero_std": 0.5, "grad_norm": 0.5346319079399109, "kl": NaN, "learning_rate": 4.522084195997239e-07, "loss": -0.0373, "num_tokens": 11372550.0, "reward": 1.008333444595337, "reward_std": 0.22453658282756805, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 328 }, { "completion_length": 1133.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 3432.0, "completions/max_terminated_length": 3432.0, "completions/mean_length": 1133.25, "completions/mean_terminated_length": 1133.25, "completions/min_length": 560.0, "completions/min_terminated_length": 560.0, "epoch": 0.1116010854816825, "frac_reward_zero_std": 1.0, "grad_norm": 2.394129978711135e-07, "kl": 0.0, "learning_rate": 4.520358868184955e-07, "loss": 0.0, "num_tokens": 11402877.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 329 }, { "completion_length": 3503.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4891.0, "completions/max_terminated_length": 4891.0, "completions/mean_length": 3503.916748046875, "completions/mean_terminated_length": 3503.916748046875, "completions/min_length": 2531.0, "completions/min_terminated_length": 2531.0, "epoch": 0.11194029850746269, "frac_reward_zero_std": 0.5, "grad_norm": 0.5139081478118896, "kl": 0.0, "learning_rate": 4.5186335403726703e-07, "loss": 0.0014, "num_tokens": 11455586.0, "reward": 1.0333335399627686, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 330 }, { "completion_length": 752.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1077.0, "completions/max_terminated_length": 1077.0, "completions/mean_length": 752.1666870117188, "completions/mean_terminated_length": 752.1666870117188, "completions/min_length": 545.0, "completions/min_terminated_length": 545.0, "epoch": 0.11227951153324288, "frac_reward_zero_std": 0.5, "grad_norm": 0.2810387909412384, "kl": 0.0, "learning_rate": 4.5169082125603863e-07, "loss": -0.0016, "num_tokens": 11472094.0, "reward": 1.0208334922790527, "reward_std": 0.19391795992851257, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 331 }, { "completion_length": 3109.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6303.0, "completions/max_terminated_length": 6303.0, "completions/mean_length": 3109.666748046875, "completions/mean_terminated_length": 3109.666748046875, "completions/min_length": 1302.0, "completions/min_terminated_length": 1302.0, "epoch": 0.11261872455902307, "frac_reward_zero_std": 0.5, "grad_norm": 0.49435505270957947, "kl": 0.0, "learning_rate": 4.5151828847481024e-07, "loss": -0.0082, "num_tokens": 11521260.0, "reward": 1.0333335399627686, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 332 }, { "completion_length": 2569.25, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5876.0, "completions/mean_length": 3118.33349609375, "completions/mean_terminated_length": 2802.818359375, "completions/min_length": 704.0, "completions/min_terminated_length": 704.0, "epoch": 0.11295793758480326, "frac_reward_zero_std": 0.5, "grad_norm": 1.2554776668548584, "kl": NaN, "learning_rate": 4.5134575569358174e-07, "loss": -0.0563, "num_tokens": 11561421.0, "reward": 0.5125000476837158, "reward_std": 0.3184925317764282, "rewards/correctness_reward_func/mean": 0.25, "rewards/correctness_reward_func/std": 0.45226702094078064, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 333 }, { "completion_length": 1469.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2300.0, "completions/max_terminated_length": 2300.0, "completions/mean_length": 1469.916748046875, "completions/mean_terminated_length": 1469.916748046875, "completions/min_length": 763.0, "completions/min_terminated_length": 763.0, "epoch": 0.11329715061058344, "frac_reward_zero_std": 0.5, "grad_norm": 0.08878318965435028, "kl": 0.0, "learning_rate": 4.5117322291235335e-07, "loss": 0.0007, "num_tokens": 11590916.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 334 }, { "completion_length": 2060.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3121.0, "completions/max_terminated_length": 3121.0, "completions/mean_length": 2060.33349609375, "completions/mean_terminated_length": 2060.33349609375, "completions/min_length": 733.0, "completions/min_terminated_length": 733.0, "epoch": 0.11363636363636363, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.510006901311249e-07, "loss": 0.0, "num_tokens": 11627238.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 335 }, { "completion_length": 2561.0001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 3956.0, "completions/max_terminated_length": 3956.0, "completions/mean_length": 2561.0, "completions/mean_terminated_length": 2561.0, "completions/min_length": 1604.0, "completions/min_terminated_length": 1604.0, "epoch": 0.11397557666214382, "frac_reward_zero_std": 0.0, "grad_norm": 1.5772103071212769, "kl": 0.0, "learning_rate": 4.508281573498965e-07, "loss": 0.0067, "num_tokens": 11666136.0, "reward": 1.1500000953674316, "reward_std": 0.2557638883590698, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.28444522619247437, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 336 }, { "completion_length": 1338.5833740234375, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 6187.0, "completions/mean_length": 2985.83349609375, "completions/mean_terminated_length": 1784.77783203125, "completions/min_length": 672.0, "completions/min_terminated_length": 672.0, "epoch": 0.11431478968792401, "frac_reward_zero_std": 0.5, "grad_norm": 0.14361168444156647, "kl": NaN, "learning_rate": 4.50655624568668e-07, "loss": -0.0182, "num_tokens": 11693431.0, "reward": 0.6250001192092896, "reward_std": 0.08215838670730591, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 337 }, { "completion_length": 1112.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3285.0, "completions/max_terminated_length": 3285.0, "completions/mean_length": 1112.666748046875, "completions/mean_terminated_length": 1112.666748046875, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.11465400271370421, "frac_reward_zero_std": 0.5, "grad_norm": 0.3458263874053955, "kl": 0.0, "learning_rate": 4.504830917874396e-07, "loss": -0.0033, "num_tokens": 11719575.0, "reward": 1.0375001430511475, "reward_std": 0.17446348071098328, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.24308621883392334, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 338 }, { "completion_length": 2146.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5674.0, "completions/max_terminated_length": 5674.0, "completions/mean_length": 2146.58349609375, "completions/mean_terminated_length": 2146.58349609375, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 0.1149932157394844, "frac_reward_zero_std": 0.0, "grad_norm": 0.5401544570922852, "kl": 0.0, "learning_rate": 4.5031055900621116e-07, "loss": 0.0189, "num_tokens": 11756044.0, "reward": 0.6708333492279053, "reward_std": 0.22469764947891235, "rewards/correctness_reward_func/mean": 0.38333332538604736, "rewards/correctness_reward_func/std": 0.4783177673816681, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 339 }, { "completion_length": 2249.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4573.0, "completions/max_terminated_length": 4573.0, "completions/mean_length": 2249.166748046875, "completions/mean_terminated_length": 2249.166748046875, "completions/min_length": 1230.0, "completions/min_terminated_length": 1230.0, "epoch": 0.11533242876526459, "frac_reward_zero_std": 0.5, "grad_norm": 0.08829537034034729, "kl": 0.0, "learning_rate": 4.501380262249827e-07, "loss": -0.0025, "num_tokens": 11795994.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 340 }, { "completion_length": 2166.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4724.0, "completions/max_terminated_length": 4724.0, "completions/mean_length": 2166.0, "completions/mean_terminated_length": 2166.0, "completions/min_length": 693.0, "completions/min_terminated_length": 693.0, "epoch": 0.11567164179104478, "frac_reward_zero_std": 0.5, "grad_norm": 0.09921236336231232, "kl": 0.0, "learning_rate": 4.4996549344375427e-07, "loss": 0.0037, "num_tokens": 11834874.0, "reward": 1.133333444595337, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 341 }, { "completion_length": 1815.8333740234375, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 5702.0, "completions/mean_length": 3463.08349609375, "completions/mean_terminated_length": 2421.111083984375, "completions/min_length": 956.0, "completions/min_terminated_length": 956.0, "epoch": 0.11601085481682497, "frac_reward_zero_std": 0.5, "grad_norm": 0.7487086653709412, "kl": NaN, "learning_rate": 4.497929606625259e-07, "loss": -0.0992, "num_tokens": 11865634.0, "reward": 0.9750000238418579, "reward_std": 0.35601967573165894, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.45226702094078064, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 342 }, { "completion_length": 1490.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4809.0, "completions/max_terminated_length": 4809.0, "completions/mean_length": 1490.25, "completions/mean_terminated_length": 1490.25, "completions/min_length": 912.0, "completions/min_terminated_length": 912.0, "epoch": 0.11635006784260515, "frac_reward_zero_std": 0.5, "grad_norm": 0.07796503603458405, "kl": 0.0, "learning_rate": 4.4962042788129743e-07, "loss": 0.0001, "num_tokens": 11891383.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 343 }, { "completion_length": 1805.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3139.0, "completions/max_terminated_length": 3139.0, "completions/mean_length": 1805.3333740234375, "completions/mean_terminated_length": 1805.3333740234375, "completions/min_length": 947.0, "completions/min_terminated_length": 947.0, "epoch": 0.11668928086838534, "frac_reward_zero_std": 1.0, "grad_norm": 2.4056276970441104e-07, "kl": 0.0, "learning_rate": 4.49447895100069e-07, "loss": 0.0, "num_tokens": 11928323.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 344 }, { "completion_length": 2549.83349609375, "completions/clipped_ratio": 0.0, "completions/max_length": 4606.0, "completions/max_terminated_length": 4606.0, "completions/mean_length": 2549.83349609375, "completions/mean_terminated_length": 2549.83349609375, "completions/min_length": 779.0, "completions/min_terminated_length": 779.0, "epoch": 0.11702849389416553, "frac_reward_zero_std": 0.0, "grad_norm": 0.6675385236740112, "kl": 0.0, "learning_rate": 4.4927536231884053e-07, "loss": 0.0182, "num_tokens": 11969691.0, "reward": 1.0500000715255737, "reward_std": 0.20412415266036987, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.24308621883392334, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 345 }, { "completion_length": 3366.666748046875, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 6368.0, "completions/mean_length": 5013.9169921875, "completions/mean_terminated_length": 4488.88916015625, "completions/min_length": 1588.0, "completions/min_terminated_length": 1588.0, "epoch": 0.11736770691994572, "frac_reward_zero_std": 0.0, "grad_norm": 0.836334228515625, "kl": NaN, "learning_rate": 4.4910282953761214e-07, "loss": -0.0777, "num_tokens": 12023327.0, "reward": 0.6041667461395264, "reward_std": 0.29333966970443726, "rewards/correctness_reward_func/mean": 0.36666667461395264, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.23750001192092896, "rewards/format_reward_func/std": 0.11894422769546509, "step": 346 }, { "completion_length": 1315.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2489.0, "completions/max_terminated_length": 2489.0, "completions/mean_length": 1315.916748046875, "completions/mean_terminated_length": 1315.916748046875, "completions/min_length": 523.0, "completions/min_terminated_length": 523.0, "epoch": 0.11770691994572592, "frac_reward_zero_std": 1.0, "grad_norm": 2.1746159006852395e-07, "kl": 0.0, "learning_rate": 4.4893029675638374e-07, "loss": 0.0, "num_tokens": 12052486.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 347 }, { "completion_length": 3217.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6038.0, "completions/max_terminated_length": 6038.0, "completions/mean_length": 3217.666748046875, "completions/mean_terminated_length": 3217.666748046875, "completions/min_length": 1073.0, "completions/min_terminated_length": 1073.0, "epoch": 0.11804613297150611, "frac_reward_zero_std": 0.0, "grad_norm": 0.5589861273765564, "kl": 0.0, "learning_rate": 4.4875776397515524e-07, "loss": -0.004, "num_tokens": 12104628.0, "reward": 1.0875000953674316, "reward_std": 0.2607758939266205, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.2696799635887146, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 348 }, { "completion_length": 1836.5834350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 5143.0, "completions/max_terminated_length": 5143.0, "completions/mean_length": 1836.5833740234375, "completions/mean_terminated_length": 1836.5833740234375, "completions/min_length": 686.0, "completions/min_terminated_length": 686.0, "epoch": 0.1183853459972863, "frac_reward_zero_std": 0.5, "grad_norm": 0.10859017074108124, "kl": 0.0, "learning_rate": 4.4858523119392685e-07, "loss": 0.0043, "num_tokens": 12135391.0, "reward": 1.0875000953674316, "reward_std": 0.030618613585829735, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 349 }, { "completion_length": 2148.0834350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 3424.0, "completions/max_terminated_length": 3424.0, "completions/mean_length": 2148.08349609375, "completions/mean_terminated_length": 2148.08349609375, "completions/min_length": 928.0, "completions/min_terminated_length": 928.0, "epoch": 0.11872455902306649, "frac_reward_zero_std": 0.5, "grad_norm": 0.08867096155881882, "kl": 0.0, "learning_rate": 4.484126984126984e-07, "loss": 0.0015, "num_tokens": 12173024.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 350 }, { "completion_length": 1849.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3647.0, "completions/max_terminated_length": 3647.0, "completions/mean_length": 1849.666748046875, "completions/mean_terminated_length": 1849.666748046875, "completions/min_length": 829.0, "completions/min_terminated_length": 829.0, "epoch": 0.11906377204884667, "frac_reward_zero_std": 0.5, "grad_norm": 0.13948985934257507, "kl": 0.0, "learning_rate": 4.4824016563146996e-07, "loss": 0.0055, "num_tokens": 12207754.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 351 }, { "completion_length": 1228.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2132.0, "completions/max_terminated_length": 2132.0, "completions/mean_length": 1228.916748046875, "completions/mean_terminated_length": 1228.916748046875, "completions/min_length": 790.0, "completions/min_terminated_length": 790.0, "epoch": 0.11940298507462686, "frac_reward_zero_std": 0.5, "grad_norm": 0.3335708677768707, "kl": 0.0, "learning_rate": 4.480676328502415e-07, "loss": -0.0039, "num_tokens": 12235113.0, "reward": 1.1375000476837158, "reward_std": 0.20600365102291107, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.28444522619247437, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 352 }, { "completion_length": 1891.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4444.0, "completions/max_terminated_length": 4444.0, "completions/mean_length": 1891.416748046875, "completions/mean_terminated_length": 1891.416748046875, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 0.11974219810040705, "frac_reward_zero_std": 0.5, "grad_norm": 0.3772743046283722, "kl": 0.0, "learning_rate": 4.478951000690131e-07, "loss": -0.0017, "num_tokens": 12268598.0, "reward": 1.133333444595337, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.2806917726993561, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 353 }, { "completion_length": 1879.8334350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 5891.0, "completions/max_terminated_length": 5891.0, "completions/mean_length": 1879.8333740234375, "completions/mean_terminated_length": 1879.8333740234375, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 0.12008141112618724, "frac_reward_zero_std": 0.0, "grad_norm": 0.9070640802383423, "kl": 0.0, "learning_rate": 4.4772256728778467e-07, "loss": -0.0369, "num_tokens": 12300792.0, "reward": 0.3583333492279053, "reward_std": 0.24285396933555603, "rewards/correctness_reward_func/mean": 0.0833333358168602, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 354 }, { "completion_length": 1915.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4033.0, "completions/max_terminated_length": 4033.0, "completions/mean_length": 1915.0, "completions/mean_terminated_length": 1915.0, "completions/min_length": 814.0, "completions/min_terminated_length": 814.0, "epoch": 0.12042062415196743, "frac_reward_zero_std": 0.5, "grad_norm": 0.09051363170146942, "kl": 0.0, "learning_rate": 4.475500345065562e-07, "loss": -0.0006, "num_tokens": 12334404.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 355 }, { "completion_length": 1688.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3541.0, "completions/max_terminated_length": 3541.0, "completions/mean_length": 1688.0833740234375, "completions/mean_terminated_length": 1688.0833740234375, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "epoch": 0.12075983717774763, "frac_reward_zero_std": 0.5, "grad_norm": 0.0811961218714714, "kl": 0.0, "learning_rate": 4.4737750172532777e-07, "loss": 0.0016, "num_tokens": 12362599.0, "reward": 1.133333444595337, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 356 }, { "completion_length": 1835.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3861.0, "completions/max_terminated_length": 3861.0, "completions/mean_length": 1835.166748046875, "completions/mean_terminated_length": 1835.166748046875, "completions/min_length": 697.0, "completions/min_terminated_length": 697.0, "epoch": 0.12109905020352782, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.472049689440994e-07, "loss": 0.0, "num_tokens": 12398781.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 357 }, { "completion_length": 983.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1866.0, "completions/max_terminated_length": 1866.0, "completions/mean_length": 983.75, "completions/mean_terminated_length": 983.75, "completions/min_length": 584.0, "completions/min_terminated_length": 584.0, "epoch": 0.12143826322930801, "frac_reward_zero_std": 1.0, "grad_norm": 1.9967588116287516e-07, "kl": 0.0, "learning_rate": 4.4703243616287093e-07, "loss": 0.0, "num_tokens": 12422490.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 358 }, { "completion_length": 3115.75, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6423.0, "completions/mean_length": 4213.9169921875, "completions/mean_terminated_length": 3738.900146484375, "completions/min_length": 1909.0, "completions/min_terminated_length": 1909.0, "epoch": 0.1217774762550882, "frac_reward_zero_std": 0.0, "grad_norm": 0.9891613721847534, "kl": NaN, "learning_rate": 4.468599033816425e-07, "loss": -0.0521, "num_tokens": 12475245.0, "reward": 0.783333420753479, "reward_std": 0.2532995343208313, "rewards/correctness_reward_func/mean": 0.5333333015441895, "rewards/correctness_reward_func/std": 0.47736650705337524, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 359 }, { "completion_length": 2723.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4374.0, "completions/max_terminated_length": 4374.0, "completions/mean_length": 2723.166748046875, "completions/mean_terminated_length": 2723.166748046875, "completions/min_length": 1459.0, "completions/min_terminated_length": 1459.0, "epoch": 0.12211668928086838, "frac_reward_zero_std": 1.0, "grad_norm": 1.2058346499088657e-07, "kl": 0.0, "learning_rate": 4.4668737060041404e-07, "loss": 0.0, "num_tokens": 12520649.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 360 }, { "completion_length": 1992.416748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5684.0, "completions/mean_length": 2541.5, "completions/mean_terminated_length": 2173.54541015625, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 0.12245590230664857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2540488541126251, "kl": NaN, "learning_rate": 4.4651483781918564e-07, "loss": -0.0095, "num_tokens": 12556066.0, "reward": 0.6791667342185974, "reward_std": 0.10357433557510376, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 361 }, { "completion_length": 2420.5834350585938, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5967.0, "completions/mean_length": 2969.666748046875, "completions/mean_terminated_length": 2640.636474609375, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 0.12279511533242876, "frac_reward_zero_std": 0.0, "grad_norm": 0.33543863892555237, "kl": NaN, "learning_rate": 4.4634230503795714e-07, "loss": -0.0596, "num_tokens": 12599795.0, "reward": 1.0958333015441895, "reward_std": 0.2968290448188782, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.28069180250167847, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 362 }, { "completion_length": 2000.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4281.0, "completions/max_terminated_length": 4281.0, "completions/mean_length": 2000.166748046875, "completions/mean_terminated_length": 2000.166748046875, "completions/min_length": 627.0, "completions/min_terminated_length": 627.0, "epoch": 0.12313432835820895, "frac_reward_zero_std": 0.5, "grad_norm": 0.47814515233039856, "kl": 0.0, "learning_rate": 4.4616977225672875e-07, "loss": 0.0103, "num_tokens": 12634171.0, "reward": 1.066666603088379, "reward_std": 0.20655910670757294, "rewards/correctness_reward_func/mean": 0.7666666507720947, "rewards/correctness_reward_func/std": 0.3700941503047943, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 363 }, { "completion_length": 2920.33349609375, "completions/clipped_ratio": 0.0, "completions/max_length": 6174.0, "completions/max_terminated_length": 6174.0, "completions/mean_length": 2920.33349609375, "completions/mean_terminated_length": 2920.33349609375, "completions/min_length": 636.0, "completions/min_terminated_length": 636.0, "epoch": 0.12347354138398914, "frac_reward_zero_std": 0.0, "grad_norm": 0.5874470472335815, "kl": 0.0, "learning_rate": 4.4599723947550035e-07, "loss": 0.0089, "num_tokens": 12684635.0, "reward": 1.133333444595337, "reward_std": 0.25163978338241577, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.28069180250167847, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 364 }, { "completion_length": 2071.666748046875, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5161.0, "completions/mean_length": 3169.83349609375, "completions/mean_terminated_length": 2486.0, "completions/min_length": 726.0, "completions/min_terminated_length": 726.0, "epoch": 0.12381275440976934, "frac_reward_zero_std": 0.5, "grad_norm": 0.7931068539619446, "kl": NaN, "learning_rate": 4.458247066942719e-07, "loss": -0.0785, "num_tokens": 12722335.0, "reward": 0.5166666507720947, "reward_std": 0.2840188145637512, "rewards/correctness_reward_func/mean": 0.2666666805744171, "rewards/correctness_reward_func/std": 0.393892765045166, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 365 }, { "completion_length": 544.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 857.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 544.8333740234375, "completions/mean_terminated_length": 544.8333740234375, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.12415196743554953, "frac_reward_zero_std": 0.5, "grad_norm": 0.2730981707572937, "kl": 0.0, "learning_rate": 4.4565217391304346e-07, "loss": -0.0016, "num_tokens": 12739961.0, "reward": 1.0208334922790527, "reward_std": 0.19391795992851257, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 366 }, { "completion_length": 2281.8334350585938, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5693.0, "completions/mean_length": 2830.916748046875, "completions/mean_terminated_length": 2489.272705078125, "completions/min_length": 815.0, "completions/min_terminated_length": 815.0, "epoch": 0.12449118046132972, "frac_reward_zero_std": 0.5, "grad_norm": 0.7804614305496216, "kl": NaN, "learning_rate": 4.45479641131815e-07, "loss": -0.0158, "num_tokens": 12782919.0, "reward": 0.8416666984558105, "reward_std": 0.1855172961950302, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.5033223032951355, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 367 }, { "completion_length": 1517.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2935.0, "completions/max_terminated_length": 2935.0, "completions/mean_length": 1517.416748046875, "completions/mean_terminated_length": 1517.416748046875, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 0.1248303934871099, "frac_reward_zero_std": 1.0, "grad_norm": 1.2498109924763412e-07, "kl": 0.0, "learning_rate": 4.453071083505866e-07, "loss": 0.0, "num_tokens": 12811460.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 368 }, { "completion_length": 2193.5833740234375, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 4981.0, "completions/mean_length": 3291.75, "completions/mean_terminated_length": 2632.300048828125, "completions/min_length": 1154.0, "completions/min_terminated_length": 1154.0, "epoch": 0.1251696065128901, "frac_reward_zero_std": 0.5, "grad_norm": 0.574193000793457, "kl": NaN, "learning_rate": 4.4513457556935817e-07, "loss": -0.0544, "num_tokens": 12849537.0, "reward": 0.9166667461395264, "reward_std": 0.2840188145637512, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 369 }, { "completion_length": 2472.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5330.0, "completions/max_terminated_length": 5330.0, "completions/mean_length": 2472.666748046875, "completions/mean_terminated_length": 2472.666748046875, "completions/min_length": 976.0, "completions/min_terminated_length": 976.0, "epoch": 0.1255088195386703, "frac_reward_zero_std": 1.0, "grad_norm": 3.123369936020026e-07, "kl": 0.0, "learning_rate": 4.449620427881297e-07, "loss": 0.0, "num_tokens": 12895625.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 370 }, { "completion_length": 3849.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5591.0, "completions/max_terminated_length": 5591.0, "completions/mean_length": 3849.166748046875, "completions/mean_terminated_length": 3849.166748046875, "completions/min_length": 2576.0, "completions/min_terminated_length": 2576.0, "epoch": 0.12584803256445048, "frac_reward_zero_std": 0.5, "grad_norm": 0.506696343421936, "kl": 0.0, "learning_rate": 4.447895100069013e-07, "loss": -0.0017, "num_tokens": 12953713.0, "reward": 0.8666666746139526, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.5033223032951355, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 371 }, { "completion_length": 1875.416748046875, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 5215.0, "completions/mean_length": 3522.666748046875, "completions/mean_terminated_length": 2500.5556640625, "completions/min_length": 1012.0, "completions/min_terminated_length": 1012.0, "epoch": 0.12618724559023067, "frac_reward_zero_std": 0.0, "grad_norm": 0.761985719203949, "kl": NaN, "learning_rate": 4.446169772256729e-07, "loss": -0.085, "num_tokens": 12991038.0, "reward": 0.4916667342185974, "reward_std": 0.4645467698574066, "rewards/correctness_reward_func/mean": 0.2666666805744171, "rewards/correctness_reward_func/std": 0.393892765045166, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 372 }, { "completion_length": 2674.416748046875, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6334.0, "completions/mean_length": 3772.58349609375, "completions/mean_terminated_length": 3209.300048828125, "completions/min_length": 1402.0, "completions/min_terminated_length": 1402.0, "epoch": 0.12652645861601086, "frac_reward_zero_std": 0.5, "grad_norm": 0.10343494266271591, "kl": NaN, "learning_rate": 4.444444444444444e-07, "loss": -0.0178, "num_tokens": 13036073.0, "reward": 0.25, "reward_std": 0.0774596780538559, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 373 }, { "completion_length": 2471.08349609375, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 4921.0, "completions/mean_length": 3569.25, "completions/mean_terminated_length": 2965.300048828125, "completions/min_length": 1494.0, "completions/min_terminated_length": 1494.0, "epoch": 0.12686567164179105, "frac_reward_zero_std": 0.0, "grad_norm": 0.7868385910987854, "kl": NaN, "learning_rate": 4.44271911663216e-07, "loss": -0.0781, "num_tokens": 13080804.0, "reward": 0.783333420753479, "reward_std": 0.4425841271877289, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 374 }, { "completion_length": 1423.1666870117188, "completions/clipped_ratio": 0.5, "completions/max_length": 6589.0, "completions/max_terminated_length": 4723.0, "completions/mean_length": 4717.6669921875, "completions/mean_terminated_length": 2846.33349609375, "completions/min_length": 1608.0, "completions/min_terminated_length": 1608.0, "epoch": 0.12720488466757124, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": NaN, "learning_rate": 4.4409937888198754e-07, "loss": 0.0, "num_tokens": 13109138.0, "reward": 0.6499999761581421, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.15000000596046448, "rewards/format_reward_func/std": 0.15666989982128143, "step": 375 }, { "completion_length": 2622.75, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4907.0, "completions/mean_length": 3171.83349609375, "completions/mean_terminated_length": 2861.181884765625, "completions/min_length": 706.0, "completions/min_terminated_length": 706.0, "epoch": 0.12754409769335143, "frac_reward_zero_std": 0.0, "grad_norm": 0.6112473607063293, "kl": NaN, "learning_rate": 4.4392684610075915e-07, "loss": -0.0348, "num_tokens": 13149719.0, "reward": 0.9750000834465027, "reward_std": 0.4121825695037842, "rewards/correctness_reward_func/mean": 0.699999988079071, "rewards/correctness_reward_func/std": 0.3357488214969635, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 376 }, { "completion_length": 2402.0001220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5788.0, "completions/mean_length": 2951.08349609375, "completions/mean_terminated_length": 2620.36376953125, "completions/min_length": 1429.0, "completions/min_terminated_length": 1429.0, "epoch": 0.12788331071913162, "frac_reward_zero_std": 0.0, "grad_norm": 0.2551642656326294, "kl": NaN, "learning_rate": 4.4375431331953065e-07, "loss": -0.0368, "num_tokens": 13188329.0, "reward": 1.0750000476837158, "reward_std": 0.2761763334274292, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.2696799635887146, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 377 }, { "completion_length": 2022.0000610351562, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6286.0, "completions/mean_length": 2571.08349609375, "completions/mean_terminated_length": 2205.818359375, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "epoch": 0.1282225237449118, "frac_reward_zero_std": 0.0, "grad_norm": 0.7428491711616516, "kl": NaN, "learning_rate": 4.4358178053830225e-07, "loss": -0.0548, "num_tokens": 13223171.0, "reward": 0.6083333492279053, "reward_std": 0.2925342321395874, "rewards/correctness_reward_func/mean": 0.38333332538604736, "rewards/correctness_reward_func/std": 0.4783177673816681, "rewards/format_reward_func/mean": 0.2250000238418579, "rewards/format_reward_func/std": 0.10112998634576797, "step": 378 }, { "completion_length": 877.5000457763672, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4776.0, "completions/mean_length": 3073.83349609375, "completions/mean_terminated_length": 1316.25, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 0.128561736770692, "frac_reward_zero_std": 0.5, "grad_norm": 0.17597438395023346, "kl": NaN, "learning_rate": 4.4340924775707386e-07, "loss": -0.019, "num_tokens": 13243979.0, "reward": 0.699999988079071, "reward_std": 0.0774596780538559, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.14770980179309845, "step": 379 }, { "completion_length": 1323.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1966.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 1323.166748046875, "completions/mean_terminated_length": 1323.166748046875, "completions/min_length": 777.0, "completions/min_terminated_length": 777.0, "epoch": 0.12890094979647218, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.432367149758454e-07, "loss": 0.0, "num_tokens": 13267591.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 380 }, { "completion_length": 1451.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3437.0, "completions/max_terminated_length": 3437.0, "completions/mean_length": 1451.8333740234375, "completions/mean_terminated_length": 1451.8333740234375, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 0.12924016282225237, "frac_reward_zero_std": 0.5, "grad_norm": 0.7834616899490356, "kl": 0.0, "learning_rate": 4.4306418219461696e-07, "loss": -0.0123, "num_tokens": 13297223.0, "reward": 0.8666667342185974, "reward_std": 0.2581988573074341, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.42497774958610535, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 381 }, { "completion_length": 1094.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2571.0, "completions/max_terminated_length": 2571.0, "completions/mean_length": 1094.0833740234375, "completions/mean_terminated_length": 1094.0833740234375, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.12957937584803256, "frac_reward_zero_std": 1.0, "grad_norm": 2.6127665364583663e-07, "kl": 0.0, "learning_rate": 4.428916494133885e-07, "loss": 0.0, "num_tokens": 13324314.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 382 }, { "completion_length": 2174.916748046875, "completions/clipped_ratio": 0.5, "completions/max_length": 6589.0, "completions/max_terminated_length": 5623.0, "completions/mean_length": 5469.4169921875, "completions/mean_terminated_length": 4349.83349609375, "completions/min_length": 3537.0, "completions/min_terminated_length": 3537.0, "epoch": 0.12991858887381275, "frac_reward_zero_std": 0.5, "grad_norm": 0.7466797232627869, "kl": NaN, "learning_rate": 4.427191166321601e-07, "loss": 0.0017, "num_tokens": 13360529.0, "reward": 0.3999999761581421, "reward_std": 0.27386125922203064, "rewards/correctness_reward_func/mean": 0.25, "rewards/correctness_reward_func/std": 0.45226702094078064, "rewards/format_reward_func/mean": 0.15000000596046448, "rewards/format_reward_func/std": 0.15666989982128143, "step": 383 }, { "completion_length": 2885.0001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 5507.0, "completions/max_terminated_length": 5507.0, "completions/mean_length": 2885.0, "completions/mean_terminated_length": 2885.0, "completions/min_length": 690.0, "completions/min_terminated_length": 690.0, "epoch": 0.13025780189959293, "frac_reward_zero_std": 1.0, "grad_norm": 9.003652934325146e-08, "kl": 0.0, "learning_rate": 4.425465838509316e-07, "loss": 0.0, "num_tokens": 13409909.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 384 }, { "completion_length": 1805.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3747.0, "completions/max_terminated_length": 3747.0, "completions/mean_length": 1805.916748046875, "completions/mean_terminated_length": 1805.916748046875, "completions/min_length": 892.0, "completions/min_terminated_length": 892.0, "epoch": 0.13059701492537312, "frac_reward_zero_std": 0.5, "grad_norm": 0.4168512523174286, "kl": 0.0, "learning_rate": 4.4237405106970323e-07, "loss": -0.0029, "num_tokens": 13443088.0, "reward": 1.0208334922790527, "reward_std": 0.19391795992851257, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 385 }, { "completion_length": 1178.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2213.0, "completions/max_terminated_length": 2213.0, "completions/mean_length": 1178.0, "completions/mean_terminated_length": 1178.0, "completions/min_length": 848.0, "completions/min_terminated_length": 848.0, "epoch": 0.1309362279511533, "frac_reward_zero_std": 0.0, "grad_norm": 0.14581476151943207, "kl": 0.0, "learning_rate": 4.422015182884748e-07, "loss": -0.0007, "num_tokens": 13469020.0, "reward": 1.2166666984558105, "reward_std": 0.10641199350357056, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 386 }, { "completion_length": 1169.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1971.0, "completions/max_terminated_length": 1971.0, "completions/mean_length": 1169.8333740234375, "completions/mean_terminated_length": 1169.8333740234375, "completions/min_length": 731.0, "completions/min_terminated_length": 731.0, "epoch": 0.13127544097693353, "frac_reward_zero_std": 0.5, "grad_norm": 0.0867735743522644, "kl": 0.0, "learning_rate": 4.420289855072464e-07, "loss": -0.0019, "num_tokens": 13495550.0, "reward": 0.7666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 387 }, { "completion_length": 1863.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 5399.0, "completions/max_terminated_length": 5399.0, "completions/mean_length": 1863.166748046875, "completions/mean_terminated_length": 1863.166748046875, "completions/min_length": 535.0, "completions/min_terminated_length": 535.0, "epoch": 0.13161465400271372, "frac_reward_zero_std": 0.5, "grad_norm": 0.49067234992980957, "kl": 0.0, "learning_rate": 4.418564527260179e-07, "loss": 0.0198, "num_tokens": 13532824.0, "reward": 1.2000000476837158, "reward_std": 0.20000001788139343, "rewards/correctness_reward_func/mean": 0.9000000357627869, "rewards/correctness_reward_func/std": 0.28919950127601624, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 388 }, { "completion_length": 1987.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4437.0, "completions/max_terminated_length": 4437.0, "completions/mean_length": 1987.416748046875, "completions/mean_terminated_length": 1987.416748046875, "completions/min_length": 896.0, "completions/min_terminated_length": 896.0, "epoch": 0.1319538670284939, "frac_reward_zero_std": 0.0, "grad_norm": 0.6929435133934021, "kl": 0.0, "learning_rate": 4.416839199447895e-07, "loss": -0.0, "num_tokens": 13566951.0, "reward": 0.7750000953674316, "reward_std": 0.2602938413619995, "rewards/correctness_reward_func/mean": 0.4999999701976776, "rewards/correctness_reward_func/std": 0.4472135901451111, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 389 }, { "completion_length": 1799.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2813.0, "completions/max_terminated_length": 2813.0, "completions/mean_length": 1799.5, "completions/mean_terminated_length": 1799.5, "completions/min_length": 1107.0, "completions/min_terminated_length": 1107.0, "epoch": 0.1322930800542741, "frac_reward_zero_std": 0.5, "grad_norm": 0.12015525251626968, "kl": 0.0, "learning_rate": 4.4151138716356105e-07, "loss": 0.0012, "num_tokens": 13601895.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.09045340120792389, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 390 }, { "completion_length": 1699.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4197.0, "completions/max_terminated_length": 4197.0, "completions/mean_length": 1699.416748046875, "completions/mean_terminated_length": 1699.416748046875, "completions/min_length": 847.0, "completions/min_terminated_length": 847.0, "epoch": 0.13263229308005428, "frac_reward_zero_std": 0.5, "grad_norm": 0.39857861399650574, "kl": 0.0, "learning_rate": 4.4133885438233265e-07, "loss": -0.0103, "num_tokens": 13631660.0, "reward": 1.1166667938232422, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 391 }, { "completion_length": 1202.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2096.0, "completions/max_terminated_length": 2096.0, "completions/mean_length": 1202.5833740234375, "completions/mean_terminated_length": 1202.5833740234375, "completions/min_length": 643.0, "completions/min_terminated_length": 643.0, "epoch": 0.13297150610583447, "frac_reward_zero_std": 0.0, "grad_norm": 0.3004428446292877, "kl": 0.0, "learning_rate": 4.4116632160110415e-07, "loss": 0.0008, "num_tokens": 13657545.0, "reward": 1.0375001430511475, "reward_std": 0.23474279046058655, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.24308621883392334, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 392 }, { "completion_length": 3542.166748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5556.0, "completions/mean_length": 4091.25, "completions/mean_terminated_length": 3864.181884765625, "completions/min_length": 2148.0, "completions/min_terminated_length": 2148.0, "epoch": 0.13331071913161466, "frac_reward_zero_std": 0.0, "grad_norm": 0.1583491563796997, "kl": NaN, "learning_rate": 4.4099378881987576e-07, "loss": -0.0071, "num_tokens": 13712039.0, "reward": 0.21250002086162567, "reward_std": 0.06934845447540283, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.21250002086162567, "rewards/format_reward_func/std": 0.07723929733037949, "step": 393 }, { "completion_length": 1208.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2981.0, "completions/max_terminated_length": 2981.0, "completions/mean_length": 1208.5, "completions/mean_terminated_length": 1208.5, "completions/min_length": 619.0, "completions/min_terminated_length": 619.0, "epoch": 0.13364993215739485, "frac_reward_zero_std": 0.5, "grad_norm": 0.10130270570516586, "kl": 0.0, "learning_rate": 4.4082125603864736e-07, "loss": 0.0026, "num_tokens": 13737101.0, "reward": 1.133333444595337, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 394 }, { "completion_length": 2585.0001220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5495.0, "completions/mean_length": 3134.08349609375, "completions/mean_terminated_length": 2820.0, "completions/min_length": 1315.0, "completions/min_terminated_length": 1315.0, "epoch": 0.13398914518317503, "frac_reward_zero_std": 0.0, "grad_norm": 0.608137845993042, "kl": NaN, "learning_rate": 4.4064872325741886e-07, "loss": -0.0506, "num_tokens": 13782419.0, "reward": 1.058333396911621, "reward_std": 0.2877541482448578, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.26227444410324097, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 395 }, { "completion_length": 3074.166748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5930.0, "completions/mean_length": 3623.25, "completions/mean_terminated_length": 3353.636474609375, "completions/min_length": 1629.0, "completions/min_terminated_length": 1629.0, "epoch": 0.13432835820895522, "frac_reward_zero_std": 0.5, "grad_norm": 0.5104953646659851, "kl": NaN, "learning_rate": 4.4047619047619047e-07, "loss": -0.0287, "num_tokens": 13833343.0, "reward": 0.9416667819023132, "reward_std": 0.24983328580856323, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 396 }, { "completion_length": 1857.0833740234375, "completions/clipped_ratio": 0.5833333333333333, "completions/max_length": 6589.0, "completions/max_terminated_length": 5661.0, "completions/mean_length": 5700.6669921875, "completions/mean_terminated_length": 4457.0, "completions/min_length": 3203.0, "completions/min_terminated_length": 3203.0, "epoch": 0.1346675712347354, "frac_reward_zero_std": 0.0, "grad_norm": 0.1551879197359085, "kl": NaN, "learning_rate": 4.40303657694962e-07, "loss": -0.0141, "num_tokens": 13864922.0, "reward": 0.13750001788139343, "reward_std": 0.09185586869716644, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.13750000298023224, "rewards/format_reward_func/std": 0.14943073689937592, "step": 397 }, { "completion_length": 3534.75, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6351.0, "completions/mean_length": 4632.9169921875, "completions/mean_terminated_length": 4241.7001953125, "completions/min_length": 1820.0, "completions/min_terminated_length": 1820.0, "epoch": 0.1350067842605156, "frac_reward_zero_std": 0.5, "grad_norm": 0.24700386822223663, "kl": NaN, "learning_rate": 4.4013112491373363e-07, "loss": -0.0244, "num_tokens": 13922897.0, "reward": 0.6500000953674316, "reward_std": 0.0774596780538559, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 398 }, { "completion_length": 997.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1720.0, "completions/max_terminated_length": 1720.0, "completions/mean_length": 997.1666870117188, "completions/mean_terminated_length": 997.1666870117188, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 0.1353459972862958, "frac_reward_zero_std": 0.5, "grad_norm": 0.05267763510346413, "kl": 0.0, "learning_rate": 4.3995859213250513e-07, "loss": 0.0, "num_tokens": 13944649.0, "reward": 1.1875, "reward_std": 0.030618607997894287, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 399 }, { "completion_length": 2047.666748046875, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6290.0, "completions/mean_length": 4244.0, "completions/mean_terminated_length": 3071.5, "completions/min_length": 1685.0, "completions/min_terminated_length": 1685.0, "epoch": 0.13568521031207598, "frac_reward_zero_std": 0.5, "grad_norm": 0.15240563452243805, "kl": NaN, "learning_rate": 4.3978605935127673e-07, "loss": -0.0224, "num_tokens": 13983681.0, "reward": 0.6250001192092896, "reward_std": 0.06708204001188278, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.11965861916542053, "step": 400 }, { "completion_length": 1873.4166870117188, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6427.0, "completions/mean_length": 2422.5, "completions/mean_terminated_length": 2043.727294921875, "completions/min_length": 580.0, "completions/min_terminated_length": 580.0, "epoch": 0.13602442333785617, "frac_reward_zero_std": 0.5, "grad_norm": 0.0715797170996666, "kl": NaN, "learning_rate": 4.396135265700483e-07, "loss": -0.0117, "num_tokens": 14017424.0, "reward": 0.7749999761581421, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 401 }, { "completion_length": 2485.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3410.0, "completions/max_terminated_length": 3410.0, "completions/mean_length": 2485.58349609375, "completions/mean_terminated_length": 2485.58349609375, "completions/min_length": 1527.0, "completions/min_terminated_length": 1527.0, "epoch": 0.13636363636363635, "frac_reward_zero_std": 1.0, "grad_norm": 2.7927711698794155e-07, "kl": 0.0, "learning_rate": 4.394409937888199e-07, "loss": 0.0, "num_tokens": 14057679.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 402 }, { "completion_length": 979.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1800.0, "completions/max_terminated_length": 1800.0, "completions/mean_length": 979.1666870117188, "completions/mean_terminated_length": 979.1666870117188, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "epoch": 0.13670284938941654, "frac_reward_zero_std": 0.0, "grad_norm": 0.12159288674592972, "kl": 0.0, "learning_rate": 4.392684610075914e-07, "loss": 0.0013, "num_tokens": 14083013.0, "reward": 1.254166603088379, "reward_std": 0.08225837349891663, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 403 }, { "completion_length": 784.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1312.0, "completions/max_terminated_length": 1312.0, "completions/mean_length": 784.6666870117188, "completions/mean_terminated_length": 784.6666870117188, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "epoch": 0.13704206241519673, "frac_reward_zero_std": 0.5, "grad_norm": 0.06961729377508163, "kl": 0.0, "learning_rate": 4.39095928226363e-07, "loss": -0.0009, "num_tokens": 14105899.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 404 }, { "completion_length": 1378.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2759.0, "completions/max_terminated_length": 2759.0, "completions/mean_length": 1378.8333740234375, "completions/mean_terminated_length": 1378.8333740234375, "completions/min_length": 540.0, "completions/min_terminated_length": 540.0, "epoch": 0.13738127544097695, "frac_reward_zero_std": 0.5, "grad_norm": 0.12275088578462601, "kl": 0.0, "learning_rate": 4.3892339544513455e-07, "loss": -0.0004, "num_tokens": 14135267.0, "reward": 0.7333333492279053, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 405 }, { "completion_length": 1384.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2235.0, "completions/max_terminated_length": 2235.0, "completions/mean_length": 1384.666748046875, "completions/mean_terminated_length": 1384.666748046875, "completions/min_length": 715.0, "completions/min_terminated_length": 715.0, "epoch": 0.13772048846675713, "frac_reward_zero_std": 1.0, "grad_norm": 1.1979372516179865e-07, "kl": 0.0, "learning_rate": 4.387508626639061e-07, "loss": 0.0, "num_tokens": 14159677.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 406 }, { "completion_length": 1979.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5850.0, "completions/max_terminated_length": 5850.0, "completions/mean_length": 1979.0833740234375, "completions/mean_terminated_length": 1979.0833740234375, "completions/min_length": 654.0, "completions/min_terminated_length": 654.0, "epoch": 0.13805970149253732, "frac_reward_zero_std": 0.5, "grad_norm": 0.5809157490730286, "kl": 0.0, "learning_rate": 4.3857832988267766e-07, "loss": 0.0523, "num_tokens": 14196080.0, "reward": 0.9666668176651001, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 407 }, { "completion_length": 2523.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4132.0, "completions/max_terminated_length": 4132.0, "completions/mean_length": 2523.916748046875, "completions/mean_terminated_length": 2523.916748046875, "completions/min_length": 1115.0, "completions/min_terminated_length": 1115.0, "epoch": 0.1383989145183175, "frac_reward_zero_std": 0.5, "grad_norm": 0.13894988596439362, "kl": 0.0, "learning_rate": 4.3840579710144926e-07, "loss": -0.0037, "num_tokens": 14239801.0, "reward": 1.133333444595337, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 408 }, { "completion_length": 1763.7500457763672, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 6448.0, "completions/mean_length": 3411.0, "completions/mean_terminated_length": 2351.666748046875, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.1387381275440977, "frac_reward_zero_std": 0.0, "grad_norm": 0.8226115107536316, "kl": NaN, "learning_rate": 4.3823326432022087e-07, "loss": -0.1241, "num_tokens": 14273902.0, "reward": 0.7458333969116211, "reward_std": 0.4951653480529785, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.21250002086162567, "rewards/format_reward_func/std": 0.13505050539970398, "step": 409 }, { "completion_length": 1476.4166870117188, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 5934.0, "completions/mean_length": 3123.666748046875, "completions/mean_terminated_length": 1968.5555419921875, "completions/min_length": 708.0, "completions/min_terminated_length": 708.0, "epoch": 0.1390773405698779, "frac_reward_zero_std": 0.5, "grad_norm": 1.1162936687469482, "kl": NaN, "learning_rate": 4.3806073153899237e-07, "loss": -0.082, "num_tokens": 14304171.0, "reward": 0.7583334445953369, "reward_std": 0.27095508575439453, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 410 }, { "completion_length": 3505.5833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6032.0, "completions/mean_length": 4054.666748046875, "completions/mean_terminated_length": 3824.27294921875, "completions/min_length": 1969.0, "completions/min_terminated_length": 1969.0, "epoch": 0.13941655359565808, "frac_reward_zero_std": 0.5, "grad_norm": 0.07667145878076553, "kl": NaN, "learning_rate": 4.3788819875776397e-07, "loss": -0.015, "num_tokens": 14359846.0, "reward": 0.6750000715255737, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 411 }, { "completion_length": 1065.6667175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 2098.0, "completions/max_terminated_length": 2098.0, "completions/mean_length": 1065.666748046875, "completions/mean_terminated_length": 1065.666748046875, "completions/min_length": 619.0, "completions/min_terminated_length": 619.0, "epoch": 0.13975576662143827, "frac_reward_zero_std": 0.0, "grad_norm": 0.403994083404541, "kl": 0.0, "learning_rate": 4.377156659765355e-07, "loss": 0.0055, "num_tokens": 14381556.0, "reward": 0.9874999523162842, "reward_std": 0.2497076690196991, "rewards/correctness_reward_func/mean": 0.699999988079071, "rewards/correctness_reward_func/std": 0.43064433336257935, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 412 }, { "completion_length": 1031.8333740234375, "completions/clipped_ratio": 0.41666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6426.0, "completions/mean_length": 3777.25, "completions/mean_terminated_length": 1768.857177734375, "completions/min_length": 713.0, "completions/min_terminated_length": 713.0, "epoch": 0.14009497964721845, "frac_reward_zero_std": 0.0, "grad_norm": 0.6786950826644897, "kl": NaN, "learning_rate": 4.3754313319530713e-07, "loss": -0.0741, "num_tokens": 14406148.0, "reward": 0.7291666865348816, "reward_std": 0.25515520572662354, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.5033223032951355, "rewards/format_reward_func/mean": 0.16250000894069672, "rewards/format_reward_func/std": 0.14943073689937592, "step": 413 }, { "completion_length": 2563.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5917.0, "completions/max_terminated_length": 5917.0, "completions/mean_length": 2563.166748046875, "completions/mean_terminated_length": 2563.166748046875, "completions/min_length": 726.0, "completions/min_terminated_length": 726.0, "epoch": 0.14043419267299864, "frac_reward_zero_std": 0.5, "grad_norm": 0.6143797039985657, "kl": 0.0, "learning_rate": 4.3737060041407863e-07, "loss": 0.0111, "num_tokens": 14447670.0, "reward": 0.9541667699813843, "reward_std": 0.22716552019119263, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 414 }, { "completion_length": 909.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2567.0, "completions/max_terminated_length": 2567.0, "completions/mean_length": 909.0833740234375, "completions/mean_terminated_length": 909.0833740234375, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "epoch": 0.14077340569877883, "frac_reward_zero_std": 1.0, "grad_norm": 2.1539605654652405e-07, "kl": 0.0, "learning_rate": 4.3719806763285024e-07, "loss": 0.0, "num_tokens": 14472895.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 415 }, { "completion_length": 2452.0834350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 5666.0, "completions/max_terminated_length": 5666.0, "completions/mean_length": 2452.08349609375, "completions/mean_terminated_length": 2452.08349609375, "completions/min_length": 695.0, "completions/min_terminated_length": 695.0, "epoch": 0.14111261872455902, "frac_reward_zero_std": 0.0, "grad_norm": 0.4528708755970001, "kl": 0.0, "learning_rate": 4.370255348516218e-07, "loss": -0.0006, "num_tokens": 14513966.0, "reward": 0.9708334803581238, "reward_std": 0.2679903507232666, "rewards/correctness_reward_func/mean": 0.6833333373069763, "rewards/correctness_reward_func/std": 0.32427072525024414, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 416 }, { "completion_length": 695.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1177.0, "completions/max_terminated_length": 1177.0, "completions/mean_length": 695.9166870117188, "completions/mean_terminated_length": 695.9166870117188, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 0.1414518317503392, "frac_reward_zero_std": 0.5, "grad_norm": 0.090449757874012, "kl": 0.0, "learning_rate": 4.3685300207039334e-07, "loss": 0.0007, "num_tokens": 14531941.0, "reward": 1.2333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 417 }, { "completion_length": 2134.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 5264.0, "completions/max_terminated_length": 5264.0, "completions/mean_length": 2134.666748046875, "completions/mean_terminated_length": 2134.666748046875, "completions/min_length": 599.0, "completions/min_terminated_length": 599.0, "epoch": 0.1417910447761194, "frac_reward_zero_std": 0.5, "grad_norm": 0.5045371651649475, "kl": 0.0, "learning_rate": 4.366804692891649e-07, "loss": 0.0213, "num_tokens": 14567709.0, "reward": 0.9000000953674316, "reward_std": 0.21908903121948242, "rewards/correctness_reward_func/mean": 0.6000000238418579, "rewards/correctness_reward_func/std": 0.36181360483169556, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 418 }, { "completion_length": 2361.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4302.0, "completions/max_terminated_length": 4302.0, "completions/mean_length": 2361.166748046875, "completions/mean_terminated_length": 2361.166748046875, "completions/min_length": 1277.0, "completions/min_terminated_length": 1277.0, "epoch": 0.14213025780189958, "frac_reward_zero_std": 0.5, "grad_norm": 0.11170516163110733, "kl": 0.0, "learning_rate": 4.365079365079365e-07, "loss": 0.0027, "num_tokens": 14606201.0, "reward": 1.1666667461395264, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 419 }, { "completion_length": 1313.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2266.0, "completions/max_terminated_length": 2266.0, "completions/mean_length": 1313.75, "completions/mean_terminated_length": 1313.75, "completions/min_length": 760.0, "completions/min_terminated_length": 760.0, "epoch": 0.14246947082767977, "frac_reward_zero_std": 1.0, "grad_norm": 2.5472624542999256e-07, "kl": 0.0, "learning_rate": 4.3633540372670805e-07, "loss": 0.0, "num_tokens": 14639696.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 420 }, { "completion_length": 3634.750244140625, "completions/clipped_ratio": 0.0, "completions/max_length": 5848.0, "completions/max_terminated_length": 5848.0, "completions/mean_length": 3634.75, "completions/mean_terminated_length": 3634.75, "completions/min_length": 1178.0, "completions/min_terminated_length": 1178.0, "epoch": 0.14280868385345996, "frac_reward_zero_std": 0.0, "grad_norm": 1.2318860292434692, "kl": 0.0, "learning_rate": 4.361628709454796e-07, "loss": -0.0021, "num_tokens": 14693759.0, "reward": 0.7583333849906921, "reward_std": 0.40680140256881714, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 421 }, { "completion_length": 2677.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 6421.0, "completions/max_terminated_length": 6421.0, "completions/mean_length": 2677.75, "completions/mean_terminated_length": 2677.75, "completions/min_length": 1236.0, "completions/min_terminated_length": 1236.0, "epoch": 0.14314789687924015, "frac_reward_zero_std": 0.5, "grad_norm": 0.6643301844596863, "kl": 0.0, "learning_rate": 4.3599033816425116e-07, "loss": -0.0141, "num_tokens": 14740298.0, "reward": 0.9000000953674316, "reward_std": 0.21908903121948242, "rewards/correctness_reward_func/mean": 0.6000000238418579, "rewards/correctness_reward_func/std": 0.36181360483169556, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 422 }, { "completion_length": 1688.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2633.0, "completions/max_terminated_length": 2633.0, "completions/mean_length": 1688.0, "completions/mean_terminated_length": 1688.0, "completions/min_length": 949.0, "completions/min_terminated_length": 949.0, "epoch": 0.14348710990502037, "frac_reward_zero_std": 1.0, "grad_norm": 2.688114761895122e-07, "kl": 0.0, "learning_rate": 4.3581780538302277e-07, "loss": 0.0, "num_tokens": 14770574.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 423 }, { "completion_length": 1518.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2738.0, "completions/max_terminated_length": 2738.0, "completions/mean_length": 1518.0, "completions/mean_terminated_length": 1518.0, "completions/min_length": 905.0, "completions/min_terminated_length": 905.0, "epoch": 0.14382632293080055, "frac_reward_zero_std": 0.5, "grad_norm": 0.07617669552564621, "kl": 0.0, "learning_rate": 4.356452726017943e-07, "loss": -0.0, "num_tokens": 14802632.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 424 }, { "completion_length": 2456.166748046875, "completions/clipped_ratio": 0.5, "completions/max_length": 6589.0, "completions/max_terminated_length": 6572.0, "completions/mean_length": 5750.6669921875, "completions/mean_terminated_length": 4912.33349609375, "completions/min_length": 1737.0, "completions/min_terminated_length": 1737.0, "epoch": 0.14416553595658074, "frac_reward_zero_std": 0.0, "grad_norm": 0.7163939476013184, "kl": NaN, "learning_rate": 4.3547273982056587e-07, "loss": -0.0691, "num_tokens": 14845864.0, "reward": 0.2958333492279053, "reward_std": 0.29602330923080444, "rewards/correctness_reward_func/mean": 0.13333334028720856, "rewards/correctness_reward_func/std": 0.3113996088504791, "rewards/format_reward_func/mean": 0.16250000894069672, "rewards/format_reward_func/std": 0.14943073689937592, "step": 425 }, { "completion_length": 1030.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1618.0, "completions/max_terminated_length": 1618.0, "completions/mean_length": 1030.8333740234375, "completions/mean_terminated_length": 1030.8333740234375, "completions/min_length": 699.0, "completions/min_terminated_length": 699.0, "epoch": 0.14450474898236093, "frac_reward_zero_std": 0.0, "grad_norm": 0.10956176370382309, "kl": 0.0, "learning_rate": 4.353002070393375e-07, "loss": 0.0006, "num_tokens": 14864090.0, "reward": 1.183333396911621, "reward_std": 0.09246458858251572, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 426 }, { "completion_length": 919.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3516.0, "completions/max_terminated_length": 3516.0, "completions/mean_length": 919.3333740234375, "completions/mean_terminated_length": 919.3333740234375, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.14484396200814112, "frac_reward_zero_std": 0.5, "grad_norm": 0.11441392451524734, "kl": 0.0, "learning_rate": 4.3512767425810903e-07, "loss": -0.0054, "num_tokens": 14889018.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 427 }, { "completion_length": 3217.2501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 6538.0, "completions/max_terminated_length": 6538.0, "completions/mean_length": 3217.25, "completions/mean_terminated_length": 3217.25, "completions/min_length": 1158.0, "completions/min_terminated_length": 1158.0, "epoch": 0.1451831750339213, "frac_reward_zero_std": 0.5, "grad_norm": 0.20972996950149536, "kl": 0.0, "learning_rate": 4.349551414768806e-07, "loss": -0.0038, "num_tokens": 14938941.0, "reward": 1.1666667461395264, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 428 }, { "completion_length": 2134.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3320.0, "completions/max_terminated_length": 3320.0, "completions/mean_length": 2134.916748046875, "completions/mean_terminated_length": 2134.916748046875, "completions/min_length": 1154.0, "completions/min_terminated_length": 1154.0, "epoch": 0.1455223880597015, "frac_reward_zero_std": 0.5, "grad_norm": 0.10739568620920181, "kl": 0.0, "learning_rate": 4.3478260869565214e-07, "loss": 0.0037, "num_tokens": 14972300.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 429 }, { "completion_length": 1214.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3529.0, "completions/max_terminated_length": 3529.0, "completions/mean_length": 1214.75, "completions/mean_terminated_length": 1214.75, "completions/min_length": 545.0, "completions/min_terminated_length": 545.0, "epoch": 0.14586160108548168, "frac_reward_zero_std": 0.0, "grad_norm": 0.3598671853542328, "kl": 0.0, "learning_rate": 4.3461007591442374e-07, "loss": 0.0009, "num_tokens": 15001169.0, "reward": 1.129166603088379, "reward_std": 0.261197566986084, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.21250002086162567, "rewards/format_reward_func/std": 0.07723929733037949, "step": 430 }, { "completion_length": 1586.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3801.0, "completions/max_terminated_length": 3801.0, "completions/mean_length": 1586.666748046875, "completions/mean_terminated_length": 1586.666748046875, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 0.14620081411126187, "frac_reward_zero_std": 1.0, "grad_norm": 1.477949922445987e-07, "kl": 0.0, "learning_rate": 4.344375431331953e-07, "loss": 0.0, "num_tokens": 15033133.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 431 }, { "completion_length": 859.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1384.0, "completions/max_terminated_length": 1384.0, "completions/mean_length": 859.0, "completions/mean_terminated_length": 859.0, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 0.14654002713704206, "frac_reward_zero_std": 0.5, "grad_norm": 0.039414018392562866, "kl": 0.0, "learning_rate": 4.3426501035196685e-07, "loss": -0.001, "num_tokens": 15053815.0, "reward": 0.6875001788139343, "reward_std": 0.03061862289905548, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 432 }, { "completion_length": 740.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1106.0, "completions/max_terminated_length": 1106.0, "completions/mean_length": 740.6666870117188, "completions/mean_terminated_length": 740.6666870117188, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "epoch": 0.14687924016282225, "frac_reward_zero_std": 0.0, "grad_norm": 0.07417774200439453, "kl": 0.0, "learning_rate": 4.340924775707384e-07, "loss": -0.0005, "num_tokens": 15074877.0, "reward": 1.2708332538604736, "reward_std": 0.07144343107938766, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 433 }, { "completion_length": 3072.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4774.0, "completions/max_terminated_length": 4774.0, "completions/mean_length": 3072.166748046875, "completions/mean_terminated_length": 3072.166748046875, "completions/min_length": 1584.0, "completions/min_terminated_length": 1584.0, "epoch": 0.14721845318860244, "frac_reward_zero_std": 1.0, "grad_norm": 2.7396959012548905e-07, "kl": 0.0, "learning_rate": 4.3391994478951e-07, "loss": 0.0, "num_tokens": 15122993.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 434 }, { "completion_length": 1638.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2871.0, "completions/max_terminated_length": 2871.0, "completions/mean_length": 1638.3333740234375, "completions/mean_terminated_length": 1638.3333740234375, "completions/min_length": 621.0, "completions/min_terminated_length": 621.0, "epoch": 0.14755766621438263, "frac_reward_zero_std": 1.0, "grad_norm": 2.3176340846475796e-07, "kl": 0.0, "learning_rate": 4.3374741200828156e-07, "loss": 0.0, "num_tokens": 15153723.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 435 }, { "completion_length": 2581.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5435.0, "completions/max_terminated_length": 5435.0, "completions/mean_length": 2581.0, "completions/mean_terminated_length": 2581.0, "completions/min_length": 1202.0, "completions/min_terminated_length": 1202.0, "epoch": 0.14789687924016282, "frac_reward_zero_std": 0.5, "grad_norm": 0.1409706026315689, "kl": 0.0, "learning_rate": 4.335748792270531e-07, "loss": -0.0058, "num_tokens": 15193863.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 436 }, { "completion_length": 3028.5833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5637.0, "completions/mean_length": 3577.666748046875, "completions/mean_terminated_length": 3303.9091796875, "completions/min_length": 1284.0, "completions/min_terminated_length": 1284.0, "epoch": 0.148236092265943, "frac_reward_zero_std": 0.0, "grad_norm": 0.1644480973482132, "kl": NaN, "learning_rate": 4.3340234644582466e-07, "loss": -0.0131, "num_tokens": 15246178.0, "reward": 0.7416666746139526, "reward_std": 0.1128769963979721, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 437 }, { "completion_length": 3253.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5697.0, "completions/max_terminated_length": 5697.0, "completions/mean_length": 3253.0, "completions/mean_terminated_length": 3253.0, "completions/min_length": 1295.0, "completions/min_terminated_length": 1295.0, "epoch": 0.1485753052917232, "frac_reward_zero_std": 0.5, "grad_norm": 0.13437378406524658, "kl": 0.0, "learning_rate": 4.3322981366459627e-07, "loss": 0.001, "num_tokens": 15299776.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 438 }, { "completion_length": 1089.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2189.0, "completions/max_terminated_length": 2189.0, "completions/mean_length": 1089.75, "completions/mean_terminated_length": 1089.75, "completions/min_length": 693.0, "completions/min_terminated_length": 693.0, "epoch": 0.14891451831750338, "frac_reward_zero_std": 0.0, "grad_norm": 0.4965176284313202, "kl": 0.0, "learning_rate": 4.3305728088336777e-07, "loss": 0.0096, "num_tokens": 15323659.0, "reward": 1.1000001430511475, "reward_std": 0.23782965540885925, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.2696799635887146, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 439 }, { "completion_length": 1152.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2723.0, "completions/max_terminated_length": 2723.0, "completions/mean_length": 1152.75, "completions/mean_terminated_length": 1152.75, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.14925373134328357, "frac_reward_zero_std": 0.5, "grad_norm": 0.04821237921714783, "kl": 0.0, "learning_rate": 4.328847481021394e-07, "loss": -0.0003, "num_tokens": 15347458.0, "reward": 1.1875, "reward_std": 0.030618607997894287, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 440 }, { "completion_length": 2592.0001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4795.0, "completions/max_terminated_length": 4795.0, "completions/mean_length": 2592.0, "completions/mean_terminated_length": 2592.0, "completions/min_length": 818.0, "completions/min_terminated_length": 818.0, "epoch": 0.14959294436906379, "frac_reward_zero_std": 1.0, "grad_norm": 2.51977240850465e-07, "kl": 0.0, "learning_rate": 4.32712215320911e-07, "loss": 0.0, "num_tokens": 15391168.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 441 }, { "completion_length": 2777.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4143.0, "completions/max_terminated_length": 4143.0, "completions/mean_length": 2777.666748046875, "completions/mean_terminated_length": 2777.666748046875, "completions/min_length": 1074.0, "completions/min_terminated_length": 1074.0, "epoch": 0.14993215739484397, "frac_reward_zero_std": 0.5, "grad_norm": 0.47685208916664124, "kl": 0.0, "learning_rate": 4.3253968253968253e-07, "loss": 0.0155, "num_tokens": 15437178.0, "reward": 1.0208334922790527, "reward_std": 0.19391795992851257, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.2309401035308838, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 442 }, { "completion_length": 1874.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4018.0, "completions/max_terminated_length": 4018.0, "completions/mean_length": 1874.3333740234375, "completions/mean_terminated_length": 1874.3333740234375, "completions/min_length": 866.0, "completions/min_terminated_length": 866.0, "epoch": 0.15027137042062416, "frac_reward_zero_std": 0.5, "grad_norm": 0.1033775582909584, "kl": 0.0, "learning_rate": 4.323671497584541e-07, "loss": -0.0001, "num_tokens": 15471610.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 443 }, { "completion_length": 1630.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4842.0, "completions/max_terminated_length": 4842.0, "completions/mean_length": 1630.0833740234375, "completions/mean_terminated_length": 1630.0833740234375, "completions/min_length": 564.0, "completions/min_terminated_length": 564.0, "epoch": 0.15061058344640435, "frac_reward_zero_std": 0.5, "grad_norm": 0.08019285649061203, "kl": 0.0, "learning_rate": 4.3219461697722564e-07, "loss": -0.0026, "num_tokens": 15504059.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 444 }, { "completion_length": 1034.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2473.0, "completions/max_terminated_length": 2473.0, "completions/mean_length": 1034.25, "completions/mean_terminated_length": 1034.25, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.15094979647218454, "frac_reward_zero_std": 1.0, "grad_norm": 1.2533315896234853e-07, "kl": 0.0, "learning_rate": 4.3202208419599725e-07, "loss": 0.0, "num_tokens": 15531314.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 445 }, { "completion_length": 1931.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4648.0, "completions/max_terminated_length": 4648.0, "completions/mean_length": 1931.166748046875, "completions/mean_terminated_length": 1931.166748046875, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "epoch": 0.15128900949796473, "frac_reward_zero_std": 0.5, "grad_norm": 0.7577066421508789, "kl": 0.0, "learning_rate": 4.318495514147688e-07, "loss": 0.0376, "num_tokens": 15569032.0, "reward": 0.833333432674408, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 446 }, { "completion_length": 2384.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4492.0, "completions/max_terminated_length": 4492.0, "completions/mean_length": 2384.5, "completions/mean_terminated_length": 2384.5, "completions/min_length": 647.0, "completions/min_terminated_length": 647.0, "epoch": 0.15162822252374492, "frac_reward_zero_std": 0.5, "grad_norm": 0.2805997133255005, "kl": 0.0, "learning_rate": 4.3167701863354035e-07, "loss": -0.0, "num_tokens": 15613108.0, "reward": 1.1375000476837158, "reward_std": 0.07373939454555511, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 447 }, { "completion_length": 429.8333435058594, "completions/clipped_ratio": 0.5, "completions/max_length": 6589.0, "completions/max_terminated_length": 1278.0, "completions/mean_length": 3724.33349609375, "completions/mean_terminated_length": 859.6666870117188, "completions/min_length": 591.0, "completions/min_terminated_length": 591.0, "epoch": 0.1519674355495251, "frac_reward_zero_std": 1.0, "grad_norm": 1.0236065151048024e-07, "kl": NaN, "learning_rate": 4.315044858523119e-07, "loss": 0.0, "num_tokens": 15632792.0, "reward": 0.5500000715255737, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.15000000596046448, "rewards/format_reward_func/std": 0.15666989982128143, "step": 448 }, { "completion_length": 2166.666748046875, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5026.0, "completions/mean_length": 3264.83349609375, "completions/mean_terminated_length": 2600.0, "completions/min_length": 1176.0, "completions/min_terminated_length": 1176.0, "epoch": 0.1523066485753053, "frac_reward_zero_std": 0.5, "grad_norm": 0.06184069439768791, "kl": NaN, "learning_rate": 4.313319530710835e-07, "loss": -0.0104, "num_tokens": 15670522.0, "reward": 0.6750000715255737, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 449 }, { "completion_length": 2323.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 6337.0, "completions/max_terminated_length": 6337.0, "completions/mean_length": 2323.5, "completions/mean_terminated_length": 2323.5, "completions/min_length": 855.0, "completions/min_terminated_length": 855.0, "epoch": 0.15264586160108548, "frac_reward_zero_std": 0.5, "grad_norm": 0.8851024508476257, "kl": 0.0, "learning_rate": 4.3115942028985506e-07, "loss": 0.0646, "num_tokens": 15709432.0, "reward": 1.0166667699813843, "reward_std": 0.24832776188850403, "rewards/correctness_reward_func/mean": 0.7166666984558105, "rewards/correctness_reward_func/std": 0.3459725081920624, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 450 }, { "completion_length": 2351.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4691.0, "completions/max_terminated_length": 4691.0, "completions/mean_length": 2351.5, "completions/mean_terminated_length": 2351.5, "completions/min_length": 829.0, "completions/min_terminated_length": 829.0, "epoch": 0.15298507462686567, "frac_reward_zero_std": 1.0, "grad_norm": 2.1750710743617674e-07, "kl": 0.0, "learning_rate": 4.309868875086266e-07, "loss": 0.0, "num_tokens": 15750946.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 451 }, { "completion_length": 2530.0001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 5296.0, "completions/max_terminated_length": 5296.0, "completions/mean_length": 2530.0, "completions/mean_terminated_length": 2530.0, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "epoch": 0.15332428765264586, "frac_reward_zero_std": 0.0, "grad_norm": 0.1575542837381363, "kl": 0.0, "learning_rate": 4.3081435472739817e-07, "loss": 0.0015, "num_tokens": 15794548.0, "reward": 1.1541666984558105, "reward_std": 0.08225835859775543, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 452 }, { "completion_length": 1643.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2960.0, "completions/max_terminated_length": 2960.0, "completions/mean_length": 1643.3333740234375, "completions/mean_terminated_length": 1643.3333740234375, "completions/min_length": 744.0, "completions/min_terminated_length": 744.0, "epoch": 0.15366350067842605, "frac_reward_zero_std": 1.0, "grad_norm": 1.257485422456739e-07, "kl": 0.0, "learning_rate": 4.306418219461698e-07, "loss": 0.0, "num_tokens": 15827666.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 453 }, { "completion_length": 1459.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2622.0, "completions/max_terminated_length": 2622.0, "completions/mean_length": 1459.916748046875, "completions/mean_terminated_length": 1459.916748046875, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 0.15400271370420623, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.304692891649413e-07, "loss": 0.0, "num_tokens": 15854089.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 454 }, { "completion_length": 1496.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2525.0, "completions/max_terminated_length": 2525.0, "completions/mean_length": 1496.5, "completions/mean_terminated_length": 1496.5, "completions/min_length": 720.0, "completions/min_terminated_length": 720.0, "epoch": 0.15434192672998642, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.302967563837129e-07, "loss": 0.0, "num_tokens": 15882739.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 455 }, { "completion_length": 1633.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4204.0, "completions/max_terminated_length": 4204.0, "completions/mean_length": 1633.3333740234375, "completions/mean_terminated_length": 1633.3333740234375, "completions/min_length": 531.0, "completions/min_terminated_length": 531.0, "epoch": 0.1546811397557666, "frac_reward_zero_std": 0.5, "grad_norm": 0.5038626194000244, "kl": 0.0, "learning_rate": 4.301242236024845e-07, "loss": 0.0028, "num_tokens": 15915797.0, "reward": 1.066666603088379, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.7666666507720947, "rewards/correctness_reward_func/std": 0.3700941801071167, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 456 }, { "completion_length": 2144.416748046875, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 4475.0, "completions/mean_length": 3242.58349609375, "completions/mean_terminated_length": 2573.300048828125, "completions/min_length": 863.0, "completions/min_terminated_length": 863.0, "epoch": 0.1550203527815468, "frac_reward_zero_std": 0.0, "grad_norm": 1.2132185697555542, "kl": NaN, "learning_rate": 4.2995169082125604e-07, "loss": -0.0736, "num_tokens": 15955018.0, "reward": 0.9791667461395264, "reward_std": 0.3328944146633148, "rewards/correctness_reward_func/mean": 0.7166666984558105, "rewards/correctness_reward_func/std": 0.3459725081920624, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 457 }, { "completion_length": 2428.7501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 3740.0, "completions/max_terminated_length": 3740.0, "completions/mean_length": 2428.75, "completions/mean_terminated_length": 2428.75, "completions/min_length": 1041.0, "completions/min_terminated_length": 1041.0, "epoch": 0.155359565807327, "frac_reward_zero_std": 0.5, "grad_norm": 0.09832356125116348, "kl": 0.0, "learning_rate": 4.297791580400276e-07, "loss": 0.0012, "num_tokens": 15992965.0, "reward": 1.1875, "reward_std": 0.03061862848699093, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 458 }, { "completion_length": 1184.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2296.0, "completions/max_terminated_length": 2296.0, "completions/mean_length": 1184.166748046875, "completions/mean_terminated_length": 1184.166748046875, "completions/min_length": 524.0, "completions/min_terminated_length": 524.0, "epoch": 0.1556987788331072, "frac_reward_zero_std": 0.5, "grad_norm": 0.07985512912273407, "kl": 0.0, "learning_rate": 4.2960662525879914e-07, "loss": 0.0001, "num_tokens": 16022223.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 459 }, { "completion_length": 2334.7501220703125, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5195.0, "completions/mean_length": 3432.916748046875, "completions/mean_terminated_length": 2801.699951171875, "completions/min_length": 1164.0, "completions/min_terminated_length": 1164.0, "epoch": 0.1560379918588874, "frac_reward_zero_std": 0.5, "grad_norm": 0.36306536197662354, "kl": NaN, "learning_rate": 4.2943409247757075e-07, "loss": -0.0326, "num_tokens": 16063908.0, "reward": 0.9166667461395264, "reward_std": 0.2840188145637512, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 460 }, { "completion_length": 3764.7501220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6420.0, "completions/mean_length": 4313.83349609375, "completions/mean_terminated_length": 4107.0, "completions/min_length": 2831.0, "completions/min_terminated_length": 2831.0, "epoch": 0.15637720488466758, "frac_reward_zero_std": 0.5, "grad_norm": 0.2848891615867615, "kl": NaN, "learning_rate": 4.292615596963423e-07, "loss": -0.0579, "num_tokens": 16124367.0, "reward": 0.6083333492279053, "reward_std": 0.22453658282756805, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 461 }, { "completion_length": 1265.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3036.0, "completions/max_terminated_length": 3036.0, "completions/mean_length": 1265.8333740234375, "completions/mean_terminated_length": 1265.8333740234375, "completions/min_length": 575.0, "completions/min_terminated_length": 575.0, "epoch": 0.15671641791044777, "frac_reward_zero_std": 0.5, "grad_norm": 0.3097565770149231, "kl": 0.0, "learning_rate": 4.2908902691511386e-07, "loss": 0.0004, "num_tokens": 16151689.0, "reward": 0.5541666746139526, "reward_std": 0.19900795817375183, "rewards/correctness_reward_func/mean": 0.2666666805744171, "rewards/correctness_reward_func/std": 0.393892765045166, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 462 }, { "completion_length": 2514.33349609375, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6211.0, "completions/mean_length": 3612.5, "completions/mean_terminated_length": 3017.199951171875, "completions/min_length": 594.0, "completions/min_terminated_length": 594.0, "epoch": 0.15705563093622796, "frac_reward_zero_std": 0.0, "grad_norm": 0.7741003632545471, "kl": NaN, "learning_rate": 4.289164941338854e-07, "loss": -0.045, "num_tokens": 16194395.0, "reward": 0.5, "reward_std": 0.35132092237472534, "rewards/correctness_reward_func/mean": 0.25, "rewards/correctness_reward_func/std": 0.45226702094078064, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 463 }, { "completion_length": 1660.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2979.0, "completions/max_terminated_length": 2979.0, "completions/mean_length": 1660.916748046875, "completions/mean_terminated_length": 1660.916748046875, "completions/min_length": 679.0, "completions/min_terminated_length": 679.0, "epoch": 0.15739484396200815, "frac_reward_zero_std": 0.5, "grad_norm": 0.09117577224969864, "kl": 0.0, "learning_rate": 4.28743961352657e-07, "loss": -0.0022, "num_tokens": 16227238.0, "reward": 1.2708332538604736, "reward_std": 0.045871179550886154, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 464 }, { "completion_length": 2607.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 6129.0, "completions/max_terminated_length": 6129.0, "completions/mean_length": 2607.5, "completions/mean_terminated_length": 2607.5, "completions/min_length": 1522.0, "completions/min_terminated_length": 1522.0, "epoch": 0.15773405698778833, "frac_reward_zero_std": 0.5, "grad_norm": 0.12152191996574402, "kl": 0.0, "learning_rate": 4.285714285714285e-07, "loss": -0.0006, "num_tokens": 16271164.0, "reward": 1.1666667461395264, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 465 }, { "completion_length": 2674.166748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5492.0, "completions/mean_length": 3223.25, "completions/mean_terminated_length": 2917.272705078125, "completions/min_length": 643.0, "completions/min_terminated_length": 643.0, "epoch": 0.15807327001356852, "frac_reward_zero_std": 0.0, "grad_norm": 0.6836945414543152, "kl": NaN, "learning_rate": 4.283988957902001e-07, "loss": -0.0139, "num_tokens": 16311852.0, "reward": 0.8583333492279053, "reward_std": 0.28804606199264526, "rewards/correctness_reward_func/mean": 0.5833333134651184, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 466 }, { "completion_length": 616.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1650.0, "completions/max_terminated_length": 1650.0, "completions/mean_length": 616.0, "completions/mean_terminated_length": 616.0, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.1584124830393487, "frac_reward_zero_std": 0.5, "grad_norm": 0.06278982013463974, "kl": 0.0, "learning_rate": 4.2822636300897167e-07, "loss": 0.0002, "num_tokens": 16331100.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 467 }, { "completion_length": 1923.1666870117188, "completions/clipped_ratio": 0.41666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6093.0, "completions/mean_length": 4668.58349609375, "completions/mean_terminated_length": 3296.857177734375, "completions/min_length": 932.0, "completions/min_terminated_length": 932.0, "epoch": 0.1587516960651289, "frac_reward_zero_std": 0.0, "grad_norm": 0.7384070158004761, "kl": NaN, "learning_rate": 4.280538302277433e-07, "loss": 0.0206, "num_tokens": 16367228.0, "reward": 0.37916669249534607, "reward_std": 0.3163924813270569, "rewards/correctness_reward_func/mean": 0.21666665375232697, "rewards/correctness_reward_func/std": 0.39504510164260864, "rewards/format_reward_func/mean": 0.16250000894069672, "rewards/format_reward_func/std": 0.14943073689937592, "step": 468 }, { "completion_length": 2877.7501220703125, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5040.0, "completions/mean_length": 3975.916748046875, "completions/mean_terminated_length": 3453.300048828125, "completions/min_length": 1252.0, "completions/min_terminated_length": 1252.0, "epoch": 0.1590909090909091, "frac_reward_zero_std": 0.0, "grad_norm": 0.8427397012710571, "kl": NaN, "learning_rate": 4.278812974465148e-07, "loss": -0.0179, "num_tokens": 16411697.0, "reward": 0.4624999761581421, "reward_std": 0.35593757033348083, "rewards/correctness_reward_func/mean": 0.25, "rewards/correctness_reward_func/std": 0.45226702094078064, "rewards/format_reward_func/mean": 0.21250002086162567, "rewards/format_reward_func/std": 0.11894422769546509, "step": 469 }, { "completion_length": 971.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1860.0, "completions/max_terminated_length": 1860.0, "completions/mean_length": 971.0, "completions/mean_terminated_length": 971.0, "completions/min_length": 584.0, "completions/min_terminated_length": 584.0, "epoch": 0.15943012211668928, "frac_reward_zero_std": 1.0, "grad_norm": 1.582651520948275e-07, "kl": 0.0, "learning_rate": 4.277087646652864e-07, "loss": 0.0, "num_tokens": 16437101.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 470 }, { "completion_length": 877.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1520.0, "completions/max_terminated_length": 1520.0, "completions/mean_length": 877.5833740234375, "completions/mean_terminated_length": 877.5833740234375, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.15976933514246947, "frac_reward_zero_std": 0.0, "grad_norm": 0.317609041929245, "kl": 0.0, "learning_rate": 4.2753623188405794e-07, "loss": -0.0005, "num_tokens": 16457586.0, "reward": 0.6791666746139526, "reward_std": 0.22598153352737427, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.5149286985397339, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 471 }, { "completion_length": 1876.166748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 3251.0, "completions/mean_length": 2425.25, "completions/mean_terminated_length": 2046.727294921875, "completions/min_length": 1071.0, "completions/min_terminated_length": 1071.0, "epoch": 0.16010854816824965, "frac_reward_zero_std": 0.5, "grad_norm": 0.21075165271759033, "kl": NaN, "learning_rate": 4.2736369910282954e-07, "loss": -0.0226, "num_tokens": 16495232.0, "reward": 1.0250000953674316, "reward_std": 0.23611438274383545, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.24308621883392334, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 472 }, { "completion_length": 2999.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5298.0, "completions/max_terminated_length": 5298.0, "completions/mean_length": 2999.33349609375, "completions/mean_terminated_length": 2999.33349609375, "completions/min_length": 931.0, "completions/min_terminated_length": 931.0, "epoch": 0.16044776119402984, "frac_reward_zero_std": 0.5, "grad_norm": 0.6389744877815247, "kl": 0.0, "learning_rate": 4.271911663216011e-07, "loss": -0.0009, "num_tokens": 16545672.0, "reward": 0.6333333253860474, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 473 }, { "completion_length": 1288.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3679.0, "completions/max_terminated_length": 3679.0, "completions/mean_length": 1288.75, "completions/mean_terminated_length": 1288.75, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.16078697421981003, "frac_reward_zero_std": 0.0, "grad_norm": 0.43943166732788086, "kl": 0.0, "learning_rate": 4.2701863354037265e-07, "loss": 0.0136, "num_tokens": 16575927.0, "reward": 0.6125000715255737, "reward_std": 0.21714738011360168, "rewards/correctness_reward_func/mean": 0.3500000238418579, "rewards/correctness_reward_func/std": 0.4358898997306824, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 474 }, { "completion_length": 1136.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2292.0, "completions/max_terminated_length": 2292.0, "completions/mean_length": 1136.25, "completions/mean_terminated_length": 1136.25, "completions/min_length": 605.0, "completions/min_terminated_length": 605.0, "epoch": 0.16112618724559022, "frac_reward_zero_std": 0.5, "grad_norm": 0.04957255721092224, "kl": 0.0, "learning_rate": 4.2684610075914425e-07, "loss": 0.0003, "num_tokens": 16598586.0, "reward": 0.6875001788139343, "reward_std": 0.03061862289905548, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 475 }, { "completion_length": 3438.916748046875, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 5689.0, "completions/mean_length": 5086.1669921875, "completions/mean_terminated_length": 4585.22216796875, "completions/min_length": 3284.0, "completions/min_terminated_length": 3284.0, "epoch": 0.1614654002713704, "frac_reward_zero_std": 0.5, "grad_norm": 0.16727101802825928, "kl": NaN, "learning_rate": 4.2667356797791575e-07, "loss": -0.0145, "num_tokens": 16649081.0, "reward": 0.1875, "reward_std": 0.041079193353652954, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.12990382313728333, "step": 476 }, { "completion_length": 2295.2501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 5441.0, "completions/max_terminated_length": 5441.0, "completions/mean_length": 2295.25, "completions/mean_terminated_length": 2295.25, "completions/min_length": 550.0, "completions/min_terminated_length": 550.0, "epoch": 0.16180461329715062, "frac_reward_zero_std": 0.5, "grad_norm": 0.6933732032775879, "kl": 0.0, "learning_rate": 4.2650103519668736e-07, "loss": 0.019, "num_tokens": 16686392.0, "reward": 1.0833334922790527, "reward_std": 0.19407901167869568, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.26227444410324097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 477 }, { "completion_length": 2071.3333740234375, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 5041.0, "completions/mean_length": 3718.58349609375, "completions/mean_terminated_length": 2761.77783203125, "completions/min_length": 963.0, "completions/min_terminated_length": 963.0, "epoch": 0.1621438263229308, "frac_reward_zero_std": 0.0, "grad_norm": 0.9527459740638733, "kl": NaN, "learning_rate": 4.263285024154589e-07, "loss": -0.0759, "num_tokens": 16724532.0, "reward": 0.7916667461395264, "reward_std": 0.31841057538986206, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.42497774958610535, "rewards/format_reward_func/mean": 0.2250000238418579, "rewards/format_reward_func/std": 0.11965861171483994, "step": 478 }, { "completion_length": 2586.0001220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6358.0, "completions/mean_length": 3135.08349609375, "completions/mean_terminated_length": 2821.091064453125, "completions/min_length": 727.0, "completions/min_terminated_length": 727.0, "epoch": 0.162483039348711, "frac_reward_zero_std": 0.5, "grad_norm": 0.6366803646087646, "kl": NaN, "learning_rate": 4.261559696342305e-07, "loss": -0.0671, "num_tokens": 16771362.0, "reward": 1.191666603088379, "reward_std": 0.26536136865615845, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 479 }, { "completion_length": 712.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1226.0, "completions/max_terminated_length": 1226.0, "completions/mean_length": 712.5, "completions/mean_terminated_length": 712.5, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 0.1628222523744912, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.25983436853002e-07, "loss": 0.0, "num_tokens": 16790934.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 480 }, { "completion_length": 1154.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2626.0, "completions/max_terminated_length": 2626.0, "completions/mean_length": 1154.666748046875, "completions/mean_terminated_length": 1154.666748046875, "completions/min_length": 672.0, "completions/min_terminated_length": 672.0, "epoch": 0.16316146540027138, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.258109040717736e-07, "loss": 0.0, "num_tokens": 16819826.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 481 }, { "completion_length": 3743.750244140625, "completions/clipped_ratio": 0.0, "completions/max_length": 6112.0, "completions/max_terminated_length": 6112.0, "completions/mean_length": 3743.75, "completions/mean_terminated_length": 3743.75, "completions/min_length": 1972.0, "completions/min_terminated_length": 1972.0, "epoch": 0.16350067842605157, "frac_reward_zero_std": 1.0, "grad_norm": 4.1540701545272896e-07, "kl": 0.0, "learning_rate": 4.256383712905452e-07, "loss": 0.0, "num_tokens": 16876589.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 482 }, { "completion_length": 2762.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5686.0, "completions/max_terminated_length": 5686.0, "completions/mean_length": 2762.916748046875, "completions/mean_terminated_length": 2762.916748046875, "completions/min_length": 1178.0, "completions/min_terminated_length": 1178.0, "epoch": 0.16383989145183175, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.254658385093168e-07, "loss": 0.0, "num_tokens": 16923526.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 483 }, { "completion_length": 2833.1666870117188, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6037.0, "completions/mean_length": 5029.5, "completions/mean_terminated_length": 4249.75, "completions/min_length": 2548.0, "completions/min_terminated_length": 2548.0, "epoch": 0.16417910447761194, "frac_reward_zero_std": 0.5, "grad_norm": 0.6901265978813171, "kl": NaN, "learning_rate": 4.252933057280883e-07, "loss": -0.0956, "num_tokens": 16970076.0, "reward": 0.7333334684371948, "reward_std": 0.2840188145637512, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.14770980179309845, "step": 484 }, { "completion_length": 1643.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2812.0, "completions/max_terminated_length": 2812.0, "completions/mean_length": 1643.666748046875, "completions/mean_terminated_length": 1643.666748046875, "completions/min_length": 969.0, "completions/min_terminated_length": 969.0, "epoch": 0.16451831750339213, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.251207729468599e-07, "loss": 0.0, "num_tokens": 17005076.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 485 }, { "completion_length": 1820.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3135.0, "completions/max_terminated_length": 3135.0, "completions/mean_length": 1820.8333740234375, "completions/mean_terminated_length": 1820.8333740234375, "completions/min_length": 499.0, "completions/min_terminated_length": 499.0, "epoch": 0.16485753052917232, "frac_reward_zero_std": 0.0, "grad_norm": 0.6359934210777283, "kl": 0.0, "learning_rate": 4.2494824016563144e-07, "loss": 0.0045, "num_tokens": 17038380.0, "reward": 1.004166841506958, "reward_std": 0.28193777799606323, "rewards/correctness_reward_func/mean": 0.7166666984558105, "rewards/correctness_reward_func/std": 0.3459725081920624, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 486 }, { "completion_length": 1532.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3784.0, "completions/max_terminated_length": 3784.0, "completions/mean_length": 1532.416748046875, "completions/mean_terminated_length": 1532.416748046875, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 0.1651967435549525, "frac_reward_zero_std": 0.5, "grad_norm": 0.5672784447669983, "kl": 0.0, "learning_rate": 4.24775707384403e-07, "loss": 0.0126, "num_tokens": 17068793.0, "reward": 0.6499999761581421, "reward_std": 0.17606817185878754, "rewards/correctness_reward_func/mean": 0.3500000238418579, "rewards/correctness_reward_func/std": 0.4358898997306824, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 487 }, { "completion_length": 1676.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2966.0, "completions/max_terminated_length": 2966.0, "completions/mean_length": 1676.166748046875, "completions/mean_terminated_length": 1676.166748046875, "completions/min_length": 652.0, "completions/min_terminated_length": 652.0, "epoch": 0.1655359565807327, "frac_reward_zero_std": 0.0, "grad_norm": 0.6939499378204346, "kl": 0.0, "learning_rate": 4.246031746031746e-07, "loss": -0.0032, "num_tokens": 17098855.0, "reward": 0.5708333253860474, "reward_std": 0.43851161003112793, "rewards/correctness_reward_func/mean": 0.28333333134651184, "rewards/correctness_reward_func/std": 0.4217568039894104, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 488 }, { "completion_length": 608.5, "completions/clipped_ratio": 0.0, "completions/max_length": 911.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 608.5, "completions/mean_terminated_length": 608.5, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "epoch": 0.16587516960651288, "frac_reward_zero_std": 1.0, "grad_norm": 9.13784745648627e-08, "kl": 0.0, "learning_rate": 4.2443064182194615e-07, "loss": 0.0, "num_tokens": 17118703.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 489 }, { "completion_length": 1851.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4297.0, "completions/max_terminated_length": 4297.0, "completions/mean_length": 1851.8333740234375, "completions/mean_terminated_length": 1851.8333740234375, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 0.16621438263229307, "frac_reward_zero_std": 0.0, "grad_norm": 0.16395634412765503, "kl": 0.0, "learning_rate": 4.2425810904071776e-07, "loss": -0.0026, "num_tokens": 17150327.0, "reward": 1.1666667461395264, "reward_std": 0.09559705853462219, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 490 }, { "completion_length": 1038.9167175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 2055.0, "completions/max_terminated_length": 2055.0, "completions/mean_length": 1038.916748046875, "completions/mean_terminated_length": 1038.916748046875, "completions/min_length": 536.0, "completions/min_terminated_length": 536.0, "epoch": 0.16655359565807326, "frac_reward_zero_std": 0.0, "grad_norm": 0.10568089783191681, "kl": 0.0, "learning_rate": 4.2408557625948926e-07, "loss": -0.0016, "num_tokens": 17176492.0, "reward": 0.7375000715255737, "reward_std": 0.0853908583521843, "rewards/correctness_reward_func/mean": 0.44999995827674866, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 491 }, { "completion_length": 2283.416748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6078.0, "completions/mean_length": 2832.5, "completions/mean_terminated_length": 2491.0, "completions/min_length": 654.0, "completions/min_terminated_length": 654.0, "epoch": 0.16689280868385345, "frac_reward_zero_std": 0.5, "grad_norm": 0.4687753915786743, "kl": NaN, "learning_rate": 4.2391304347826086e-07, "loss": -0.0141, "num_tokens": 17219097.0, "reward": 0.8083333969116211, "reward_std": 0.23327383399009705, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 492 }, { "completion_length": 2379.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5512.0, "completions/max_terminated_length": 5512.0, "completions/mean_length": 2379.666748046875, "completions/mean_terminated_length": 2379.666748046875, "completions/min_length": 583.0, "completions/min_terminated_length": 583.0, "epoch": 0.16723202170963364, "frac_reward_zero_std": 0.0, "grad_norm": 0.5325803756713867, "kl": 0.0, "learning_rate": 4.237405106970324e-07, "loss": 0.0065, "num_tokens": 17260247.0, "reward": 0.9500000476837158, "reward_std": 0.27386125922203064, "rewards/correctness_reward_func/mean": 0.6499999761581421, "rewards/correctness_reward_func/std": 0.40113475918769836, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 493 }, { "completion_length": 2168.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3947.0, "completions/max_terminated_length": 3947.0, "completions/mean_length": 2168.666748046875, "completions/mean_terminated_length": 2168.666748046875, "completions/min_length": 1066.0, "completions/min_terminated_length": 1066.0, "epoch": 0.16757123473541383, "frac_reward_zero_std": 1.0, "grad_norm": 1.7554351927628886e-07, "kl": 0.0, "learning_rate": 4.23567977915804e-07, "loss": 0.0, "num_tokens": 17298409.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 494 }, { "completion_length": 1945.0001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4646.0, "completions/max_terminated_length": 4646.0, "completions/mean_length": 1945.0, "completions/mean_terminated_length": 1945.0, "completions/min_length": 657.0, "completions/min_terminated_length": 657.0, "epoch": 0.16791044776119404, "frac_reward_zero_std": 1.0, "grad_norm": 2.706338761981897e-07, "kl": 0.0, "learning_rate": 4.233954451345755e-07, "loss": 0.0, "num_tokens": 17336221.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 495 }, { "completion_length": 1856.25, "completions/clipped_ratio": 0.0, "completions/max_length": 4646.0, "completions/max_terminated_length": 4646.0, "completions/mean_length": 1856.25, "completions/mean_terminated_length": 1856.25, "completions/min_length": 642.0, "completions/min_terminated_length": 642.0, "epoch": 0.16824966078697423, "frac_reward_zero_std": 0.0, "grad_norm": 0.0943993553519249, "kl": 0.0, "learning_rate": 4.2322291235334713e-07, "loss": -0.0045, "num_tokens": 17374858.0, "reward": 1.2666666507720947, "reward_std": 0.08164961636066437, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 496 }, { "completion_length": 1558.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3671.0, "completions/max_terminated_length": 3671.0, "completions/mean_length": 1558.75, "completions/mean_terminated_length": 1558.75, "completions/min_length": 586.0, "completions/min_terminated_length": 586.0, "epoch": 0.16858887381275442, "frac_reward_zero_std": 1.0, "grad_norm": 1.6221547127770464e-07, "kl": 0.0, "learning_rate": 4.230503795721187e-07, "loss": 0.0, "num_tokens": 17402695.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 497 }, { "completion_length": 3426.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6069.0, "completions/max_terminated_length": 6069.0, "completions/mean_length": 3426.666748046875, "completions/mean_terminated_length": 3426.666748046875, "completions/min_length": 1866.0, "completions/min_terminated_length": 1866.0, "epoch": 0.1689280868385346, "frac_reward_zero_std": 0.0, "grad_norm": 0.6243091821670532, "kl": 0.0, "learning_rate": 4.2287784679089023e-07, "loss": 0.0319, "num_tokens": 17453313.0, "reward": 0.4541667103767395, "reward_std": 0.28881752490997314, "rewards/correctness_reward_func/mean": 0.1666666716337204, "rewards/correctness_reward_func/std": 0.3892494738101959, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 498 }, { "completion_length": 1164.6667175292969, "completions/clipped_ratio": 0.5, "completions/max_length": 6589.0, "completions/max_terminated_length": 3076.0, "completions/mean_length": 4459.1669921875, "completions/mean_terminated_length": 2329.33349609375, "completions/min_length": 1407.0, "completions/min_terminated_length": 1407.0, "epoch": 0.1692672998643148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": NaN, "learning_rate": 4.227053140096618e-07, "loss": 0.0, "num_tokens": 17482115.0, "reward": 0.15000000596046448, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.15000000596046448, "rewards/format_reward_func/std": 0.15666989982128143, "step": 499 }, { "completion_length": 1079.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 2631.0, "completions/max_terminated_length": 2631.0, "completions/mean_length": 1079.5, "completions/mean_terminated_length": 1079.5, "completions/min_length": 621.0, "completions/min_terminated_length": 621.0, "epoch": 0.16960651289009498, "frac_reward_zero_std": 1.0, "grad_norm": 9.258025102099054e-08, "kl": 0.0, "learning_rate": 4.225327812284334e-07, "loss": 0.0, "num_tokens": 17506619.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 500 }, { "completion_length": 1423.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2845.0, "completions/max_terminated_length": 2845.0, "completions/mean_length": 1423.0833740234375, "completions/mean_terminated_length": 1423.0833740234375, "completions/min_length": 978.0, "completions/min_terminated_length": 978.0, "epoch": 0.16994572591587517, "frac_reward_zero_std": 0.5, "grad_norm": 0.1390148103237152, "kl": 0.0, "learning_rate": 4.2236024844720495e-07, "loss": 0.0015, "num_tokens": 17536302.0, "reward": 1.133333444595337, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 501 }, { "completion_length": 1925.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3312.0, "completions/max_terminated_length": 3312.0, "completions/mean_length": 1925.916748046875, "completions/mean_terminated_length": 1925.916748046875, "completions/min_length": 1054.0, "completions/min_terminated_length": 1054.0, "epoch": 0.17028493894165536, "frac_reward_zero_std": 1.0, "grad_norm": 3.353748923018429e-07, "kl": 0.0, "learning_rate": 4.221877156659765e-07, "loss": 0.0, "num_tokens": 17568329.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 502 }, { "completion_length": 2333.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4467.0, "completions/max_terminated_length": 4467.0, "completions/mean_length": 2333.58349609375, "completions/mean_terminated_length": 2333.58349609375, "completions/min_length": 696.0, "completions/min_terminated_length": 696.0, "epoch": 0.17062415196743555, "frac_reward_zero_std": 0.5, "grad_norm": 0.10262121260166168, "kl": 0.0, "learning_rate": 4.220151828847481e-07, "loss": -0.0002, "num_tokens": 17607348.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 503 }, { "completion_length": 2920.8334350585938, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6092.0, "completions/mean_length": 3469.916748046875, "completions/mean_terminated_length": 3186.36376953125, "completions/min_length": 556.0, "completions/min_terminated_length": 556.0, "epoch": 0.17096336499321574, "frac_reward_zero_std": 0.0, "grad_norm": 0.6888859868049622, "kl": NaN, "learning_rate": 4.2184265010351966e-07, "loss": -0.0503, "num_tokens": 17652208.0, "reward": 1.0833333730697632, "reward_std": 0.22770795226097107, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.26227444410324097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 504 }, { "completion_length": 2582.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4693.0, "completions/max_terminated_length": 4693.0, "completions/mean_length": 2582.166748046875, "completions/mean_terminated_length": 2582.166748046875, "completions/min_length": 887.0, "completions/min_terminated_length": 887.0, "epoch": 0.17130257801899593, "frac_reward_zero_std": 0.5, "grad_norm": 0.13936969637870789, "kl": 0.0, "learning_rate": 4.2167011732229126e-07, "loss": -0.0029, "num_tokens": 17696994.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 505 }, { "completion_length": 1910.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4204.0, "completions/max_terminated_length": 4204.0, "completions/mean_length": 1910.166748046875, "completions/mean_terminated_length": 1910.166748046875, "completions/min_length": 529.0, "completions/min_terminated_length": 529.0, "epoch": 0.17164179104477612, "frac_reward_zero_std": 0.5, "grad_norm": 0.7006736397743225, "kl": 0.0, "learning_rate": 4.2149758454106276e-07, "loss": 0.0117, "num_tokens": 17730788.0, "reward": 0.9500000476837158, "reward_std": 0.27386125922203064, "rewards/correctness_reward_func/mean": 0.6499999761581421, "rewards/correctness_reward_func/std": 0.40113475918769836, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 506 }, { "completion_length": 1539.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3405.0, "completions/max_terminated_length": 3405.0, "completions/mean_length": 1539.416748046875, "completions/mean_terminated_length": 1539.416748046875, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 0.1719810040705563, "frac_reward_zero_std": 0.5, "grad_norm": 0.11347401887178421, "kl": 0.0, "learning_rate": 4.2132505175983437e-07, "loss": -0.0027, "num_tokens": 17758231.0, "reward": 1.133333444595337, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 507 }, { "completion_length": 1861.0834350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 4365.0, "completions/max_terminated_length": 4365.0, "completions/mean_length": 1861.0833740234375, "completions/mean_terminated_length": 1861.0833740234375, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 0.1723202170963365, "frac_reward_zero_std": 0.5, "grad_norm": 0.16115796566009521, "kl": 0.0, "learning_rate": 4.211525189786059e-07, "loss": 0.0001, "num_tokens": 17789888.0, "reward": 1.1500000953674316, "reward_std": 0.05477222427725792, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.09045340120792389, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 508 }, { "completion_length": 1629.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3168.0, "completions/max_terminated_length": 3168.0, "completions/mean_length": 1629.8333740234375, "completions/mean_terminated_length": 1629.8333740234375, "completions/min_length": 874.0, "completions/min_terminated_length": 874.0, "epoch": 0.17265943012211668, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.209799861973775e-07, "loss": 0.0, "num_tokens": 17820948.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 509 }, { "completion_length": 1754.2501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 3233.0, "completions/max_terminated_length": 3233.0, "completions/mean_length": 1754.25, "completions/mean_terminated_length": 1754.25, "completions/min_length": 771.0, "completions/min_terminated_length": 771.0, "epoch": 0.17299864314789687, "frac_reward_zero_std": 0.5, "grad_norm": 0.33984920382499695, "kl": 0.0, "learning_rate": 4.2080745341614903e-07, "loss": 0.01, "num_tokens": 17851845.0, "reward": 0.36666667461395264, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.06666667014360428, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 510 }, { "completion_length": 1409.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4162.0, "completions/max_terminated_length": 4162.0, "completions/mean_length": 1409.3333740234375, "completions/mean_terminated_length": 1409.3333740234375, "completions/min_length": 551.0, "completions/min_terminated_length": 551.0, "epoch": 0.17333785617367706, "frac_reward_zero_std": 0.5, "grad_norm": 0.3745829463005066, "kl": 0.0, "learning_rate": 4.2063492063492063e-07, "loss": 0.0158, "num_tokens": 17880055.0, "reward": 0.833333432674408, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 511 }, { "completion_length": 1085.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1805.0, "completions/max_terminated_length": 1805.0, "completions/mean_length": 1085.916748046875, "completions/mean_terminated_length": 1085.916748046875, "completions/min_length": 630.0, "completions/min_terminated_length": 630.0, "epoch": 0.17367706919945725, "frac_reward_zero_std": 0.5, "grad_norm": 0.05689301714301109, "kl": 0.0, "learning_rate": 4.204623878536922e-07, "loss": -0.0001, "num_tokens": 17904756.0, "reward": 1.0875000953674316, "reward_std": 0.030618613585829735, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 512 }, { "completion_length": 2082.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3168.0, "completions/max_terminated_length": 3168.0, "completions/mean_length": 2082.83349609375, "completions/mean_terminated_length": 2082.83349609375, "completions/min_length": 1306.0, "completions/min_terminated_length": 1306.0, "epoch": 0.17401628222523746, "frac_reward_zero_std": 0.5, "grad_norm": 0.502258837223053, "kl": 0.0, "learning_rate": 4.2028985507246374e-07, "loss": 0.0137, "num_tokens": 17941312.0, "reward": 0.5666667222976685, "reward_std": 0.20655910670757294, "rewards/correctness_reward_func/mean": 0.2666666805744171, "rewards/correctness_reward_func/std": 0.393892765045166, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 513 }, { "completion_length": 1895.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3063.0, "completions/max_terminated_length": 3063.0, "completions/mean_length": 1895.916748046875, "completions/mean_terminated_length": 1895.916748046875, "completions/min_length": 1216.0, "completions/min_terminated_length": 1216.0, "epoch": 0.17435549525101765, "frac_reward_zero_std": 1.0, "grad_norm": 1.2023741646771668e-07, "kl": 0.0, "learning_rate": 4.201173222912353e-07, "loss": 0.0, "num_tokens": 17976897.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 514 }, { "completion_length": 3192.2501220703125, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6415.0, "completions/mean_length": 4290.4169921875, "completions/mean_terminated_length": 3830.699951171875, "completions/min_length": 1845.0, "completions/min_terminated_length": 1845.0, "epoch": 0.17469470827679784, "frac_reward_zero_std": 0.0, "grad_norm": 0.8532640933990479, "kl": NaN, "learning_rate": 4.199447895100069e-07, "loss": -0.0693, "num_tokens": 18027606.0, "reward": 0.9666666984558105, "reward_std": 0.3814123868942261, "rewards/correctness_reward_func/mean": 0.7166666984558105, "rewards/correctness_reward_func/std": 0.4386618733406067, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.09770084172487259, "step": 515 }, { "completion_length": 2581.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 6207.0, "completions/max_terminated_length": 6207.0, "completions/mean_length": 2581.5, "completions/mean_terminated_length": 2581.5, "completions/min_length": 869.0, "completions/min_terminated_length": 869.0, "epoch": 0.17503392130257803, "frac_reward_zero_std": 0.0, "grad_norm": 0.8625640869140625, "kl": 0.0, "learning_rate": 4.1977225672877845e-07, "loss": 0.0173, "num_tokens": 18071898.0, "reward": 0.8166667222976685, "reward_std": 0.46741676330566406, "rewards/correctness_reward_func/mean": 0.5166666507720947, "rewards/correctness_reward_func/std": 0.4628632962703705, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 516 }, { "completion_length": 1080.3333587646484, "completions/clipped_ratio": 0.0, "completions/max_length": 3990.0, "completions/max_terminated_length": 3990.0, "completions/mean_length": 1080.3333740234375, "completions/mean_terminated_length": 1080.3333740234375, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.17537313432835822, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.1959972394755e-07, "loss": 0.0, "num_tokens": 18096424.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 517 }, { "completion_length": 830.0833587646484, "completions/clipped_ratio": 0.5, "completions/max_length": 6589.0, "completions/max_terminated_length": 2506.0, "completions/mean_length": 4124.58349609375, "completions/mean_terminated_length": 1660.166748046875, "completions/min_length": 945.0, "completions/min_terminated_length": 945.0, "epoch": 0.1757123473541384, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": NaN, "learning_rate": 4.1942719116632156e-07, "loss": 0.0, "num_tokens": 18123209.0, "reward": 0.15000000596046448, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.15000000596046448, "rewards/format_reward_func/std": 0.15666989982128143, "step": 518 }, { "completion_length": 2415.0001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4186.0, "completions/max_terminated_length": 4186.0, "completions/mean_length": 2415.0, "completions/mean_terminated_length": 2415.0, "completions/min_length": 884.0, "completions/min_terminated_length": 884.0, "epoch": 0.1760515603799186, "frac_reward_zero_std": 1.0, "grad_norm": 1.772355489038091e-07, "kl": 0.0, "learning_rate": 4.1925465838509316e-07, "loss": 0.0, "num_tokens": 18164189.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 519 }, { "completion_length": 862.75, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 2617.0, "completions/mean_length": 3059.08349609375, "completions/mean_terminated_length": 1294.125, "completions/min_length": 738.0, "completions/min_terminated_length": 738.0, "epoch": 0.17639077340569878, "frac_reward_zero_std": 0.5, "grad_norm": 1.4193612337112427, "kl": NaN, "learning_rate": 4.190821256038647e-07, "loss": -0.0477, "num_tokens": 18186080.0, "reward": 0.7666667699813843, "reward_std": 0.3356585204601288, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.42497774958610535, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.14770980179309845, "step": 520 }, { "completion_length": 2900.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4391.0, "completions/max_terminated_length": 4391.0, "completions/mean_length": 2900.5, "completions/mean_terminated_length": 2900.5, "completions/min_length": 938.0, "completions/min_terminated_length": 938.0, "epoch": 0.17672998643147897, "frac_reward_zero_std": 1.0, "grad_norm": 4.6848953161315876e-07, "kl": 0.0, "learning_rate": 4.1890959282263627e-07, "loss": 0.0, "num_tokens": 18233546.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 521 }, { "completion_length": 3361.8333740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5968.0, "completions/mean_length": 3910.916748046875, "completions/mean_terminated_length": 3667.45458984375, "completions/min_length": 949.0, "completions/min_terminated_length": 949.0, "epoch": 0.17706919945725916, "frac_reward_zero_std": 0.5, "grad_norm": 0.06511241942644119, "kl": NaN, "learning_rate": 4.1873706004140787e-07, "loss": -0.0128, "num_tokens": 18285468.0, "reward": 0.6750000715255737, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 522 }, { "completion_length": 1127.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3259.0, "completions/max_terminated_length": 3259.0, "completions/mean_length": 1127.75, "completions/mean_terminated_length": 1127.75, "completions/min_length": 552.0, "completions/min_terminated_length": 552.0, "epoch": 0.17740841248303935, "frac_reward_zero_std": 0.5, "grad_norm": 0.09431690722703934, "kl": 0.0, "learning_rate": 4.185645272601794e-07, "loss": 0.0045, "num_tokens": 18310161.0, "reward": 1.1666667461395264, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 523 }, { "completion_length": 2173.0834350585938, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5945.0, "completions/mean_length": 2722.166748046875, "completions/mean_terminated_length": 2370.636474609375, "completions/min_length": 905.0, "completions/min_terminated_length": 905.0, "epoch": 0.17774762550881953, "frac_reward_zero_std": 0.0, "grad_norm": 0.13693396747112274, "kl": NaN, "learning_rate": 4.18391994478951e-07, "loss": -0.0113, "num_tokens": 18345670.0, "reward": 0.7250000834465027, "reward_std": 0.11600948870182037, "rewards/correctness_reward_func/mean": 0.44999995827674866, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 524 }, { "completion_length": 1425.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3034.0, "completions/max_terminated_length": 3034.0, "completions/mean_length": 1425.166748046875, "completions/mean_terminated_length": 1425.166748046875, "completions/min_length": 568.0, "completions/min_terminated_length": 568.0, "epoch": 0.17808683853459972, "frac_reward_zero_std": 0.0, "grad_norm": 0.12271206825971603, "kl": 0.0, "learning_rate": 4.1821946169772253e-07, "loss": 0.0042, "num_tokens": 18373776.0, "reward": 1.2333333492279053, "reward_std": 0.0955970510840416, "rewards/correctness_reward_func/mean": 0.9333333969116211, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 525 }, { "completion_length": 3609.0001220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6538.0, "completions/mean_length": 4158.08349609375, "completions/mean_terminated_length": 3937.091064453125, "completions/min_length": 863.0, "completions/min_terminated_length": 863.0, "epoch": 0.1784260515603799, "frac_reward_zero_std": 0.5, "grad_norm": 1.16100013256073, "kl": NaN, "learning_rate": 4.1804692891649414e-07, "loss": -0.0366, "num_tokens": 18428916.0, "reward": 0.3583333194255829, "reward_std": 0.22453653812408447, "rewards/correctness_reward_func/mean": 0.0833333358168602, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 526 }, { "completion_length": 2267.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4891.0, "completions/max_terminated_length": 4891.0, "completions/mean_length": 2267.666748046875, "completions/mean_terminated_length": 2267.666748046875, "completions/min_length": 1136.0, "completions/min_terminated_length": 1136.0, "epoch": 0.1787652645861601, "frac_reward_zero_std": 0.5, "grad_norm": 0.5539312958717346, "kl": 0.0, "learning_rate": 4.178743961352657e-07, "loss": 0.0149, "num_tokens": 18470840.0, "reward": 1.0500000715255737, "reward_std": 0.17606817185878754, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.24308621883392334, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 527 }, { "completion_length": 666.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1261.0, "completions/max_terminated_length": 1261.0, "completions/mean_length": 666.25, "completions/mean_terminated_length": 666.25, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 0.1791044776119403, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.1770186335403724e-07, "loss": 0.0, "num_tokens": 18494627.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 528 }, { "completion_length": 1161.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2421.0, "completions/max_terminated_length": 2421.0, "completions/mean_length": 1161.5, "completions/mean_terminated_length": 1161.5, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "epoch": 0.17944369063772048, "frac_reward_zero_std": 1.0, "grad_norm": 1.6599263119587704e-07, "kl": 0.0, "learning_rate": 4.175293305728088e-07, "loss": 0.0, "num_tokens": 18518861.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 529 }, { "completion_length": 921.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1732.0, "completions/max_terminated_length": 1732.0, "completions/mean_length": 921.0, "completions/mean_terminated_length": 921.0, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.17978290366350066, "frac_reward_zero_std": 0.5, "grad_norm": 0.20871922373771667, "kl": 0.0, "learning_rate": 4.173567977915804e-07, "loss": -0.0013, "num_tokens": 18541703.0, "reward": 0.3541666865348816, "reward_std": 0.17205862700939178, "rewards/correctness_reward_func/mean": 0.06666667014360428, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 530 }, { "completion_length": 1214.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 4228.0, "completions/max_terminated_length": 4228.0, "completions/mean_length": 1214.75, "completions/mean_terminated_length": 1214.75, "completions/min_length": 581.0, "completions/min_terminated_length": 581.0, "epoch": 0.18012211668928088, "frac_reward_zero_std": 0.5, "grad_norm": 0.06504378467798233, "kl": 0.0, "learning_rate": 4.171842650103519e-07, "loss": -0.0003, "num_tokens": 18563744.0, "reward": 0.7749999761581421, "reward_std": 0.03872983902692795, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 531 }, { "completion_length": 1132.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1858.0, "completions/max_terminated_length": 1858.0, "completions/mean_length": 1132.5, "completions/mean_terminated_length": 1132.5, "completions/min_length": 598.0, "completions/min_terminated_length": 598.0, "epoch": 0.18046132971506107, "frac_reward_zero_std": 1.0, "grad_norm": 2.329216783891752e-07, "kl": 0.0, "learning_rate": 4.170117322291235e-07, "loss": 0.0, "num_tokens": 18589352.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 532 }, { "completion_length": 701.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 918.0, "completions/max_terminated_length": 918.0, "completions/mean_length": 701.6666870117188, "completions/mean_terminated_length": 701.6666870117188, "completions/min_length": 510.0, "completions/min_terminated_length": 510.0, "epoch": 0.18080054274084126, "frac_reward_zero_std": 0.5, "grad_norm": 0.12558390200138092, "kl": 0.0, "learning_rate": 4.1683919944789506e-07, "loss": -0.0, "num_tokens": 18610486.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 533 }, { "completion_length": 1081.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1984.0, "completions/max_terminated_length": 1984.0, "completions/mean_length": 1081.666748046875, "completions/mean_terminated_length": 1081.666748046875, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 0.18113975576662145, "frac_reward_zero_std": 0.5, "grad_norm": 0.09250768274068832, "kl": 0.0, "learning_rate": 4.1666666666666667e-07, "loss": -0.0001, "num_tokens": 18636492.0, "reward": 0.75, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.44999995827674866, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 534 }, { "completion_length": 2280.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4509.0, "completions/max_terminated_length": 4509.0, "completions/mean_length": 2280.83349609375, "completions/mean_terminated_length": 2280.83349609375, "completions/min_length": 819.0, "completions/min_terminated_length": 819.0, "epoch": 0.18147896879240163, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.164941338854382e-07, "loss": 0.0, "num_tokens": 18671170.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 535 }, { "completion_length": 1914.8333740234375, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 4516.0, "completions/mean_length": 3013.0, "completions/mean_terminated_length": 2297.800048828125, "completions/min_length": 508.0, "completions/min_terminated_length": 508.0, "epoch": 0.18181818181818182, "frac_reward_zero_std": 0.0, "grad_norm": 0.438634991645813, "kl": NaN, "learning_rate": 4.1632160110420977e-07, "loss": -0.028, "num_tokens": 18707870.0, "reward": 1.0499999523162842, "reward_std": 0.3872982859611511, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.38138505816459656, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 536 }, { "completion_length": 2529.666748046875, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 5630.0, "completions/mean_length": 4176.9169921875, "completions/mean_terminated_length": 3372.888916015625, "completions/min_length": 2035.0, "completions/min_terminated_length": 2035.0, "epoch": 0.182157394843962, "frac_reward_zero_std": 0.5, "grad_norm": 0.38183891773223877, "kl": NaN, "learning_rate": 4.161490683229814e-07, "loss": -0.0232, "num_tokens": 18749434.0, "reward": 0.6375000476837158, "reward_std": 0.07373940199613571, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.23750001192092896, "rewards/format_reward_func/std": 0.11894422769546509, "step": 537 }, { "completion_length": 2225.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4071.0, "completions/max_terminated_length": 4071.0, "completions/mean_length": 2225.33349609375, "completions/mean_terminated_length": 2225.33349609375, "completions/min_length": 822.0, "completions/min_terminated_length": 822.0, "epoch": 0.1824966078697422, "frac_reward_zero_std": 1.0, "grad_norm": 1.5497386129936785e-07, "kl": 0.0, "learning_rate": 4.1597653554175293e-07, "loss": 0.0, "num_tokens": 18788006.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 538 }, { "completion_length": 1238.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3105.0, "completions/max_terminated_length": 3105.0, "completions/mean_length": 1238.0833740234375, "completions/mean_terminated_length": 1238.0833740234375, "completions/min_length": 560.0, "completions/min_terminated_length": 560.0, "epoch": 0.1828358208955224, "frac_reward_zero_std": 1.0, "grad_norm": 2.6287909804523224e-07, "kl": 0.0, "learning_rate": 4.158040027605245e-07, "loss": 0.0, "num_tokens": 18813957.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 539 }, { "completion_length": 1780.666748046875, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6236.0, "completions/mean_length": 3977.0, "completions/mean_terminated_length": 2671.0, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "epoch": 0.18317503392130258, "frac_reward_zero_std": 0.0, "grad_norm": 0.5980116128921509, "kl": NaN, "learning_rate": 4.1563146997929604e-07, "loss": -0.0649, "num_tokens": 18847355.0, "reward": 0.7333333492279053, "reward_std": 0.27224498987197876, "rewards/correctness_reward_func/mean": 0.5333333015441895, "rewards/correctness_reward_func/std": 0.47736650705337524, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.14770980179309845, "step": 540 }, { "completion_length": 2010.916748046875, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 5844.0, "completions/mean_length": 3658.166748046875, "completions/mean_terminated_length": 2681.22216796875, "completions/min_length": 1292.0, "completions/min_terminated_length": 1292.0, "epoch": 0.18351424694708277, "frac_reward_zero_std": 0.0, "grad_norm": 4.284682750701904, "kl": NaN, "learning_rate": 4.1545893719806764e-07, "loss": -0.0252, "num_tokens": 18885052.0, "reward": 0.6750000715255737, "reward_std": 0.13693061470985413, "rewards/correctness_reward_func/mean": 0.44999995827674866, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 541 }, { "completion_length": 2788.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5735.0, "completions/max_terminated_length": 5735.0, "completions/mean_length": 2788.08349609375, "completions/mean_terminated_length": 2788.08349609375, "completions/min_length": 794.0, "completions/min_terminated_length": 794.0, "epoch": 0.18385345997286295, "frac_reward_zero_std": 0.0, "grad_norm": 0.6677839756011963, "kl": 0.0, "learning_rate": 4.1528640441683914e-07, "loss": -0.0025, "num_tokens": 18932045.0, "reward": 1.066666841506958, "reward_std": 0.2168930023908615, "rewards/correctness_reward_func/mean": 0.7666667103767395, "rewards/correctness_reward_func/std": 0.25346091389656067, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 542 }, { "completion_length": 1617.0001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4218.0, "completions/max_terminated_length": 4218.0, "completions/mean_length": 1617.0, "completions/mean_terminated_length": 1617.0, "completions/min_length": 701.0, "completions/min_terminated_length": 701.0, "epoch": 0.18419267299864314, "frac_reward_zero_std": 1.0, "grad_norm": 9.618871388283878e-08, "kl": 0.0, "learning_rate": 4.1511387163561075e-07, "loss": 0.0, "num_tokens": 18965921.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 543 }, { "completion_length": 1418.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3041.0, "completions/max_terminated_length": 3041.0, "completions/mean_length": 1418.666748046875, "completions/mean_terminated_length": 1418.666748046875, "completions/min_length": 660.0, "completions/min_terminated_length": 660.0, "epoch": 0.18453188602442333, "frac_reward_zero_std": 0.5, "grad_norm": 0.2889324724674225, "kl": 0.0, "learning_rate": 4.149413388543823e-07, "loss": 0.0026, "num_tokens": 18996343.0, "reward": 0.833333432674408, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 544 }, { "completion_length": 1246.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2340.0, "completions/max_terminated_length": 2340.0, "completions/mean_length": 1246.3333740234375, "completions/mean_terminated_length": 1246.3333740234375, "completions/min_length": 633.0, "completions/min_terminated_length": 633.0, "epoch": 0.18487109905020352, "frac_reward_zero_std": 0.5, "grad_norm": 0.43304693698883057, "kl": 0.0, "learning_rate": 4.147688060731539e-07, "loss": 0.0088, "num_tokens": 19020623.0, "reward": 1.1166667938232422, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 545 }, { "completion_length": 1579.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4176.0, "completions/max_terminated_length": 4176.0, "completions/mean_length": 1579.0, "completions/mean_terminated_length": 1579.0, "completions/min_length": 531.0, "completions/min_terminated_length": 531.0, "epoch": 0.1852103120759837, "frac_reward_zero_std": 1.0, "grad_norm": 2.227987891956218e-07, "kl": 0.0, "learning_rate": 4.145962732919254e-07, "loss": 0.0, "num_tokens": 19051523.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 546 }, { "completion_length": 2309.0001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 6122.0, "completions/max_terminated_length": 6122.0, "completions/mean_length": 2309.0, "completions/mean_terminated_length": 2309.0, "completions/min_length": 819.0, "completions/min_terminated_length": 819.0, "epoch": 0.1855495251017639, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.14423740510697e-07, "loss": 0.0, "num_tokens": 19088027.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 547 }, { "completion_length": 1715.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3856.0, "completions/max_terminated_length": 3856.0, "completions/mean_length": 1715.0, "completions/mean_terminated_length": 1715.0, "completions/min_length": 691.0, "completions/min_terminated_length": 691.0, "epoch": 0.18588873812754408, "frac_reward_zero_std": 0.0, "grad_norm": 0.35062411427497864, "kl": 0.0, "learning_rate": 4.1425120772946856e-07, "loss": -0.0015, "num_tokens": 19123619.0, "reward": 0.8833333849906921, "reward_std": 0.26133137941360474, "rewards/correctness_reward_func/mean": 0.5833333134651184, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 548 }, { "completion_length": 1286.8333740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 3200.0, "completions/mean_length": 1835.916748046875, "completions/mean_terminated_length": 1403.8182373046875, "completions/min_length": 492.0, "completions/min_terminated_length": 492.0, "epoch": 0.1862279511533243, "frac_reward_zero_std": 0.0, "grad_norm": 0.7429186701774597, "kl": NaN, "learning_rate": 4.1407867494824017e-07, "loss": -0.0282, "num_tokens": 19155267.0, "reward": 0.8083333373069763, "reward_std": 0.5039968490600586, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.47736650705337524, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 549 }, { "completion_length": 2776.08349609375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6261.0, "completions/mean_length": 3325.166748046875, "completions/mean_terminated_length": 3028.45458984375, "completions/min_length": 966.0, "completions/min_terminated_length": 966.0, "epoch": 0.1865671641791045, "frac_reward_zero_std": 0.5, "grad_norm": 0.08030923455953598, "kl": NaN, "learning_rate": 4.1390614216701167e-07, "loss": -0.0084, "num_tokens": 19195720.0, "reward": 0.6875001788139343, "reward_std": 0.03061862289905548, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 550 }, { "completion_length": 2092.5833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5555.0, "completions/mean_length": 2641.666748046875, "completions/mean_terminated_length": 2282.818359375, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 0.18690637720488468, "frac_reward_zero_std": 0.5, "grad_norm": 0.49874621629714966, "kl": NaN, "learning_rate": 4.137336093857833e-07, "loss": -0.0328, "num_tokens": 19236557.0, "reward": 1.1000001430511475, "reward_std": 0.23664319515228271, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.38138505816459656, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 551 }, { "completion_length": 2317.5833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6479.0, "completions/mean_length": 2866.666748046875, "completions/mean_terminated_length": 2528.272705078125, "completions/min_length": 678.0, "completions/min_terminated_length": 678.0, "epoch": 0.18724559023066487, "frac_reward_zero_std": 0.5, "grad_norm": 0.5327326059341431, "kl": NaN, "learning_rate": 4.135610766045549e-07, "loss": -0.0418, "num_tokens": 19275342.0, "reward": 0.6916667819023132, "reward_std": 0.26536139845848083, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.5149286985397339, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 552 }, { "completion_length": 1518.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3452.0, "completions/max_terminated_length": 3452.0, "completions/mean_length": 1518.416748046875, "completions/mean_terminated_length": 1518.416748046875, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 0.18758480325644505, "frac_reward_zero_std": 0.0, "grad_norm": 0.5784144401550293, "kl": 0.0, "learning_rate": 4.1338854382332643e-07, "loss": -0.0123, "num_tokens": 19305695.0, "reward": 0.7833334803581238, "reward_std": 0.20412415266036987, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.43029236793518066, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 553 }, { "completion_length": 1451.5833740234375, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5332.0, "completions/mean_length": 2549.75, "completions/mean_terminated_length": 1741.9000244140625, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 0.18792401628222524, "frac_reward_zero_std": 0.0, "grad_norm": 0.5653297901153564, "kl": NaN, "learning_rate": 4.13216011042098e-07, "loss": -0.0303, "num_tokens": 19336656.0, "reward": 0.9125000834465027, "reward_std": 0.31922924518585205, "rewards/correctness_reward_func/mean": 0.6499999761581421, "rewards/correctness_reward_func/std": 0.40113475918769836, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 554 }, { "completion_length": 3387.5833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5996.0, "completions/mean_length": 3936.666748046875, "completions/mean_terminated_length": 3695.545654296875, "completions/min_length": 2256.0, "completions/min_terminated_length": 2256.0, "epoch": 0.18826322930800543, "frac_reward_zero_std": 0.0, "grad_norm": 0.1428702175617218, "kl": NaN, "learning_rate": 4.1304347826086954e-07, "loss": -0.0036, "num_tokens": 19388881.0, "reward": 0.7541667819023132, "reward_std": 0.08225837349891663, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 555 }, { "completion_length": 723.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1403.0, "completions/max_terminated_length": 1403.0, "completions/mean_length": 723.4166870117188, "completions/mean_terminated_length": 723.4166870117188, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "epoch": 0.18860244233378562, "frac_reward_zero_std": 1.0, "grad_norm": 1.9308046717014804e-07, "kl": 0.0, "learning_rate": 4.1287094547964115e-07, "loss": 0.0, "num_tokens": 19411362.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 556 }, { "completion_length": 995.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1785.0, "completions/max_terminated_length": 1785.0, "completions/mean_length": 995.0, "completions/mean_terminated_length": 995.0, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 0.1889416553595658, "frac_reward_zero_std": 1.0, "grad_norm": 3.172017670749483e-07, "kl": 0.0, "learning_rate": 4.1269841269841265e-07, "loss": 0.0, "num_tokens": 19436022.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 557 }, { "completion_length": 2856.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4397.0, "completions/max_terminated_length": 4397.0, "completions/mean_length": 2856.83349609375, "completions/mean_terminated_length": 2856.83349609375, "completions/min_length": 906.0, "completions/min_terminated_length": 906.0, "epoch": 0.189280868385346, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.1252587991718425e-07, "loss": 0.0, "num_tokens": 19481284.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 558 }, { "completion_length": 1833.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5297.0, "completions/max_terminated_length": 5297.0, "completions/mean_length": 1833.5, "completions/mean_terminated_length": 1833.5, "completions/min_length": 596.0, "completions/min_terminated_length": 596.0, "epoch": 0.18962008141112618, "frac_reward_zero_std": 0.5, "grad_norm": 0.08774584531784058, "kl": 0.0, "learning_rate": 4.123533471359558e-07, "loss": -0.0012, "num_tokens": 19518106.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 559 }, { "completion_length": 1456.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2801.0, "completions/max_terminated_length": 2801.0, "completions/mean_length": 1456.75, "completions/mean_terminated_length": 1456.75, "completions/min_length": 743.0, "completions/min_terminated_length": 743.0, "epoch": 0.18995929443690637, "frac_reward_zero_std": 0.5, "grad_norm": 0.4802844822406769, "kl": 0.0, "learning_rate": 4.121808143547274e-07, "loss": 0.0104, "num_tokens": 19546603.0, "reward": 0.6666667461395264, "reward_std": 0.18618986010551453, "rewards/correctness_reward_func/mean": 0.36666664481163025, "rewards/correctness_reward_func/std": 0.45792677998542786, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 560 }, { "completion_length": 1429.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3128.0, "completions/max_terminated_length": 3128.0, "completions/mean_length": 1429.5833740234375, "completions/mean_terminated_length": 1429.5833740234375, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 0.19029850746268656, "frac_reward_zero_std": 0.0, "grad_norm": 0.36741065979003906, "kl": 0.0, "learning_rate": 4.120082815734989e-07, "loss": 0.0001, "num_tokens": 19574510.0, "reward": 0.595833420753479, "reward_std": 0.20437853038311005, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 561 }, { "completion_length": 1721.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3282.0, "completions/max_terminated_length": 3282.0, "completions/mean_length": 1721.0833740234375, "completions/mean_terminated_length": 1721.0833740234375, "completions/min_length": 762.0, "completions/min_terminated_length": 762.0, "epoch": 0.19063772048846675, "frac_reward_zero_std": 1.0, "grad_norm": 1.8861383921375818e-07, "kl": 0.0, "learning_rate": 4.118357487922705e-07, "loss": 0.0, "num_tokens": 19607967.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 562 }, { "completion_length": 1103.0000305175781, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 1976.0, "completions/mean_length": 1652.0833740234375, "completions/mean_terminated_length": 1203.272705078125, "completions/min_length": 562.0, "completions/min_terminated_length": 562.0, "epoch": 0.19097693351424694, "frac_reward_zero_std": 0.5, "grad_norm": 0.17974478006362915, "kl": NaN, "learning_rate": 4.1166321601104207e-07, "loss": -0.0182, "num_tokens": 19638069.0, "reward": 1.058333396911621, "reward_std": 0.25380438566207886, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.26227444410324097, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 563 }, { "completion_length": 660.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1087.0, "completions/max_terminated_length": 1087.0, "completions/mean_length": 660.5, "completions/mean_terminated_length": 660.5, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 0.19131614654002713, "frac_reward_zero_std": 1.0, "grad_norm": 1.9672070550313947e-07, "kl": 0.0, "learning_rate": 4.114906832298137e-07, "loss": 0.0, "num_tokens": 19662405.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 564 }, { "completion_length": 913.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2214.0, "completions/max_terminated_length": 2214.0, "completions/mean_length": 913.4166870117188, "completions/mean_terminated_length": 913.4166870117188, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "epoch": 0.19165535956580732, "frac_reward_zero_std": 0.5, "grad_norm": 0.26728707551956177, "kl": 0.0, "learning_rate": 4.113181504485852e-07, "loss": -0.0042, "num_tokens": 19688930.0, "reward": 1.0333335399627686, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 565 }, { "completion_length": 621.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 913.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 621.0, "completions/mean_terminated_length": 621.0, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.1919945725915875, "frac_reward_zero_std": 1.0, "grad_norm": 8.458977163172676e-08, "kl": 0.0, "learning_rate": 4.111456176673568e-07, "loss": 0.0, "num_tokens": 19708724.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 566 }, { "completion_length": 2206.0001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4598.0, "completions/max_terminated_length": 4598.0, "completions/mean_length": 2206.0, "completions/mean_terminated_length": 2206.0, "completions/min_length": 614.0, "completions/min_terminated_length": 614.0, "epoch": 0.19233378561736772, "frac_reward_zero_std": 0.0, "grad_norm": 0.29058995842933655, "kl": 0.0, "learning_rate": 4.109730848861284e-07, "loss": -0.0083, "num_tokens": 19746638.0, "reward": 1.1166666746139526, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 567 }, { "completion_length": 1693.0834350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 6508.0, "completions/max_terminated_length": 6508.0, "completions/mean_length": 1693.0833740234375, "completions/mean_terminated_length": 1693.0833740234375, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.1926729986431479, "frac_reward_zero_std": 0.5, "grad_norm": 0.44586482644081116, "kl": 0.0, "learning_rate": 4.108005521048999e-07, "loss": -0.0105, "num_tokens": 19778715.0, "reward": 0.7166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.5149286985397339, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 568 }, { "completion_length": 2577.2501220703125, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6125.0, "completions/mean_length": 3675.416748046875, "completions/mean_terminated_length": 3092.699951171875, "completions/min_length": 1502.0, "completions/min_terminated_length": 1502.0, "epoch": 0.1930122116689281, "frac_reward_zero_std": 0.0, "grad_norm": 0.7111157774925232, "kl": NaN, "learning_rate": 4.106280193236715e-07, "loss": -0.0213, "num_tokens": 19822152.0, "reward": 0.44166669249534607, "reward_std": 0.31943613290786743, "rewards/correctness_reward_func/mean": 0.1666666716337204, "rewards/correctness_reward_func/std": 0.38924944400787354, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 569 }, { "completion_length": 1982.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2881.0, "completions/max_terminated_length": 2881.0, "completions/mean_length": 1982.3333740234375, "completions/mean_terminated_length": 1982.3333740234375, "completions/min_length": 1068.0, "completions/min_terminated_length": 1068.0, "epoch": 0.19335142469470828, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.1045548654244304e-07, "loss": 0.0, "num_tokens": 19858936.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 570 }, { "completion_length": 1354.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2324.0, "completions/max_terminated_length": 2324.0, "completions/mean_length": 1354.416748046875, "completions/mean_terminated_length": 1354.416748046875, "completions/min_length": 844.0, "completions/min_terminated_length": 844.0, "epoch": 0.19369063772048847, "frac_reward_zero_std": 1.0, "grad_norm": 1.0446248666085012e-07, "kl": 0.0, "learning_rate": 4.1028295376121465e-07, "loss": 0.0, "num_tokens": 19888419.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 571 }, { "completion_length": 2613.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4610.0, "completions/max_terminated_length": 4610.0, "completions/mean_length": 2613.5, "completions/mean_terminated_length": 2613.5, "completions/min_length": 714.0, "completions/min_terminated_length": 714.0, "epoch": 0.19402985074626866, "frac_reward_zero_std": 1.0, "grad_norm": 1.4929959490928013e-07, "kl": 0.0, "learning_rate": 4.1011042097998615e-07, "loss": 0.0, "num_tokens": 19931457.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 572 }, { "completion_length": 1108.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2757.0, "completions/max_terminated_length": 2757.0, "completions/mean_length": 1108.8333740234375, "completions/mean_terminated_length": 1108.8333740234375, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 0.19436906377204885, "frac_reward_zero_std": 0.5, "grad_norm": 0.05309825763106346, "kl": 0.0, "learning_rate": 4.0993788819875776e-07, "loss": -0.0001, "num_tokens": 19959157.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 573 }, { "completion_length": 811.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2012.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 811.0, "completions/mean_terminated_length": 811.0, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.19470827679782904, "frac_reward_zero_std": 0.5, "grad_norm": 0.2551961839199066, "kl": 0.0, "learning_rate": 4.097653554175293e-07, "loss": -0.001, "num_tokens": 19981459.0, "reward": 1.0333335399627686, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.2309401035308838, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 574 }, { "completion_length": 1731.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3343.0, "completions/max_terminated_length": 3343.0, "completions/mean_length": 1731.666748046875, "completions/mean_terminated_length": 1731.666748046875, "completions/min_length": 636.0, "completions/min_terminated_length": 636.0, "epoch": 0.19504748982360923, "frac_reward_zero_std": 0.0, "grad_norm": 0.12224812805652618, "kl": 0.0, "learning_rate": 4.095928226363009e-07, "loss": 0.0014, "num_tokens": 20017251.0, "reward": 1.2000000476837158, "reward_std": 0.08164963126182556, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 575 }, { "completion_length": 2264.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4903.0, "completions/max_terminated_length": 4903.0, "completions/mean_length": 2264.916748046875, "completions/mean_terminated_length": 2264.916748046875, "completions/min_length": 673.0, "completions/min_terminated_length": 673.0, "epoch": 0.19538670284938942, "frac_reward_zero_std": 0.5, "grad_norm": 0.07465171068906784, "kl": 0.0, "learning_rate": 4.094202898550724e-07, "loss": 0.0004, "num_tokens": 20059640.0, "reward": 1.133333444595337, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 576 }, { "completion_length": 711.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1438.0, "completions/max_terminated_length": 1438.0, "completions/mean_length": 711.9166870117188, "completions/mean_terminated_length": 711.9166870117188, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.1957259158751696, "frac_reward_zero_std": 0.5, "grad_norm": 0.3539010286331177, "kl": 0.0, "learning_rate": 4.09247757073844e-07, "loss": -0.0026, "num_tokens": 20076679.0, "reward": 1.000000238418579, "reward_std": 0.23664319515228271, "rewards/correctness_reward_func/mean": 0.699999988079071, "rewards/correctness_reward_func/std": 0.3357488214969635, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 577 }, { "completion_length": 2636.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5236.0, "completions/max_terminated_length": 5236.0, "completions/mean_length": 2636.0, "completions/mean_terminated_length": 2636.0, "completions/min_length": 934.0, "completions/min_terminated_length": 934.0, "epoch": 0.1960651289009498, "frac_reward_zero_std": 0.0, "grad_norm": 1.1721926927566528, "kl": 0.0, "learning_rate": 4.0907522429261557e-07, "loss": 0.0262, "num_tokens": 20117005.0, "reward": 0.8041667342185974, "reward_std": 0.43039870262145996, "rewards/correctness_reward_func/mean": 0.5166666507720947, "rewards/correctness_reward_func/std": 0.4628632962703705, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 578 }, { "completion_length": 1920.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4433.0, "completions/max_terminated_length": 4433.0, "completions/mean_length": 1920.5833740234375, "completions/mean_terminated_length": 1920.5833740234375, "completions/min_length": 811.0, "completions/min_terminated_length": 811.0, "epoch": 0.19640434192672998, "frac_reward_zero_std": 0.5, "grad_norm": 0.5124653577804565, "kl": 0.0, "learning_rate": 4.089026915113871e-07, "loss": -0.0106, "num_tokens": 20153522.0, "reward": 1.0833332538604736, "reward_std": 0.222860187292099, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.37618499994277954, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 579 }, { "completion_length": 1807.25, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 6471.0, "completions/mean_length": 3454.5, "completions/mean_terminated_length": 2409.666748046875, "completions/min_length": 540.0, "completions/min_terminated_length": 540.0, "epoch": 0.19674355495251017, "frac_reward_zero_std": 0.5, "grad_norm": 0.17243024706840515, "kl": NaN, "learning_rate": 4.087301587301587e-07, "loss": -0.0296, "num_tokens": 20189837.0, "reward": 0.6125000715255737, "reward_std": 0.07373940199613571, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.21250002086162567, "rewards/format_reward_func/std": 0.13505050539970398, "step": 580 }, { "completion_length": 2128.25, "completions/clipped_ratio": 0.0, "completions/max_length": 4383.0, "completions/max_terminated_length": 4383.0, "completions/mean_length": 2128.25, "completions/mean_terminated_length": 2128.25, "completions/min_length": 700.0, "completions/min_terminated_length": 700.0, "epoch": 0.19708276797829036, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.085576259489303e-07, "loss": 0.0, "num_tokens": 20230106.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 581 }, { "completion_length": 2194.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4675.0, "completions/max_terminated_length": 4675.0, "completions/mean_length": 2194.83349609375, "completions/mean_terminated_length": 2194.83349609375, "completions/min_length": 799.0, "completions/min_terminated_length": 799.0, "epoch": 0.19742198100407055, "frac_reward_zero_std": 0.0, "grad_norm": 0.6451249122619629, "kl": 0.0, "learning_rate": 4.083850931677019e-07, "loss": -0.0096, "num_tokens": 20267946.0, "reward": 1.070833444595337, "reward_std": 0.29760777950286865, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.37618499994277954, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 582 }, { "completion_length": 1643.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3262.0, "completions/max_terminated_length": 3262.0, "completions/mean_length": 1643.916748046875, "completions/mean_terminated_length": 1643.916748046875, "completions/min_length": 862.0, "completions/min_terminated_length": 862.0, "epoch": 0.19776119402985073, "frac_reward_zero_std": 0.0, "grad_norm": 0.13853637874126434, "kl": 0.0, "learning_rate": 4.082125603864734e-07, "loss": -0.0048, "num_tokens": 20298029.0, "reward": 1.2208333015441895, "reward_std": 0.10064341127872467, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 583 }, { "completion_length": 2544.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5490.0, "completions/max_terminated_length": 5490.0, "completions/mean_length": 2544.166748046875, "completions/mean_terminated_length": 2544.166748046875, "completions/min_length": 862.0, "completions/min_terminated_length": 862.0, "epoch": 0.19810040705563092, "frac_reward_zero_std": 0.5, "grad_norm": 0.7164510488510132, "kl": 0.0, "learning_rate": 4.08040027605245e-07, "loss": 0.0183, "num_tokens": 20340841.0, "reward": 1.1166666746139526, "reward_std": 0.24832773208618164, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.3857302963733673, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 584 }, { "completion_length": 843.9166870117188, "completions/clipped_ratio": 0.5, "completions/max_length": 6589.0, "completions/max_terminated_length": 2172.0, "completions/mean_length": 4138.4169921875, "completions/mean_terminated_length": 1687.8333740234375, "completions/min_length": 1129.0, "completions/min_terminated_length": 1129.0, "epoch": 0.19843962008141114, "frac_reward_zero_std": 0.5, "grad_norm": 0.10080356895923615, "kl": NaN, "learning_rate": 4.0786749482401655e-07, "loss": -0.001, "num_tokens": 20363022.0, "reward": 0.5833333730697632, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.15000000596046448, "rewards/format_reward_func/std": 0.15666989982128143, "step": 585 }, { "completion_length": 1471.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2505.0, "completions/max_terminated_length": 2505.0, "completions/mean_length": 1471.416748046875, "completions/mean_terminated_length": 1471.416748046875, "completions/min_length": 909.0, "completions/min_terminated_length": 909.0, "epoch": 0.19877883310719133, "frac_reward_zero_std": 0.5, "grad_norm": 0.07403448224067688, "kl": 0.0, "learning_rate": 4.0769496204278815e-07, "loss": -0.0001, "num_tokens": 20390729.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 586 }, { "completion_length": 1442.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2322.0, "completions/max_terminated_length": 2322.0, "completions/mean_length": 1442.5833740234375, "completions/mean_terminated_length": 1442.5833740234375, "completions/min_length": 698.0, "completions/min_terminated_length": 698.0, "epoch": 0.19911804613297152, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.0752242926155965e-07, "loss": 0.0, "num_tokens": 20417646.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 587 }, { "completion_length": 1039.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1718.0, "completions/max_terminated_length": 1718.0, "completions/mean_length": 1039.416748046875, "completions/mean_terminated_length": 1039.416748046875, "completions/min_length": 724.0, "completions/min_terminated_length": 724.0, "epoch": 0.1994572591587517, "frac_reward_zero_std": 0.5, "grad_norm": 0.09593465924263, "kl": 0.0, "learning_rate": 4.0734989648033126e-07, "loss": -0.0003, "num_tokens": 20442431.0, "reward": 1.2666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 588 }, { "completion_length": 3065.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6188.0, "completions/max_terminated_length": 6188.0, "completions/mean_length": 3065.416748046875, "completions/mean_terminated_length": 3065.416748046875, "completions/min_length": 804.0, "completions/min_terminated_length": 804.0, "epoch": 0.1997964721845319, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.071773636991028e-07, "loss": 0.0, "num_tokens": 20492140.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 589 }, { "completion_length": 704.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 965.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 704.8333740234375, "completions/mean_terminated_length": 704.8333740234375, "completions/min_length": 576.0, "completions/min_terminated_length": 576.0, "epoch": 0.20013568521031208, "frac_reward_zero_std": 1.0, "grad_norm": 2.2550463540937926e-07, "kl": 0.0, "learning_rate": 4.0700483091787437e-07, "loss": 0.0, "num_tokens": 20516426.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 590 }, { "completion_length": 1735.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3626.0, "completions/max_terminated_length": 3626.0, "completions/mean_length": 1735.8333740234375, "completions/mean_terminated_length": 1735.8333740234375, "completions/min_length": 1012.0, "completions/min_terminated_length": 1012.0, "epoch": 0.20047489823609227, "frac_reward_zero_std": 0.5, "grad_norm": 0.45365244150161743, "kl": 0.0, "learning_rate": 4.068322981366459e-07, "loss": 0.0057, "num_tokens": 20547852.0, "reward": 0.5666667222976685, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.2666666805744171, "rewards/correctness_reward_func/std": 0.393892765045166, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 591 }, { "completion_length": 1736.8333740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4041.0, "completions/mean_length": 2285.916748046875, "completions/mean_terminated_length": 1894.727294921875, "completions/min_length": 598.0, "completions/min_terminated_length": 598.0, "epoch": 0.20081411126187246, "frac_reward_zero_std": 0.0, "grad_norm": 0.2545448839664459, "kl": NaN, "learning_rate": 4.066597653554175e-07, "loss": -0.034, "num_tokens": 20578780.0, "reward": 1.0750000476837158, "reward_std": 0.29088661074638367, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.2696799635887146, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 592 }, { "completion_length": 1546.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2271.0, "completions/max_terminated_length": 2271.0, "completions/mean_length": 1546.916748046875, "completions/mean_terminated_length": 1546.916748046875, "completions/min_length": 638.0, "completions/min_terminated_length": 638.0, "epoch": 0.20115332428765265, "frac_reward_zero_std": 0.5, "grad_norm": 0.09835981577634811, "kl": 0.0, "learning_rate": 4.064872325741891e-07, "loss": -0.0017, "num_tokens": 20607141.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 593 }, { "completion_length": 2359.0833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4462.0, "completions/mean_length": 2908.166748046875, "completions/mean_terminated_length": 2573.54541015625, "completions/min_length": 781.0, "completions/min_terminated_length": 781.0, "epoch": 0.20149253731343283, "frac_reward_zero_std": 0.5, "grad_norm": 0.07446026802062988, "kl": NaN, "learning_rate": 4.0631469979296063e-07, "loss": -0.004, "num_tokens": 20643640.0, "reward": 0.27500003576278687, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 594 }, { "completion_length": 1952.8333740234375, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6144.0, "completions/mean_length": 3051.0, "completions/mean_terminated_length": 2343.400146484375, "completions/min_length": 534.0, "completions/min_terminated_length": 534.0, "epoch": 0.20183175033921302, "frac_reward_zero_std": 0.5, "grad_norm": 0.30042797327041626, "kl": NaN, "learning_rate": 4.061421670117322e-07, "loss": -0.0247, "num_tokens": 20680088.0, "reward": 0.6375000476837158, "reward_std": 0.07373940199613571, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.23750001192092896, "rewards/format_reward_func/std": 0.11894422769546509, "step": 595 }, { "completion_length": 1296.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2377.0, "completions/max_terminated_length": 2377.0, "completions/mean_length": 1296.25, "completions/mean_terminated_length": 1296.25, "completions/min_length": 656.0, "completions/min_terminated_length": 656.0, "epoch": 0.2021709633649932, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.059696342305038e-07, "loss": 0.0, "num_tokens": 20705621.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 596 }, { "completion_length": 1768.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 4853.0, "completions/max_terminated_length": 4853.0, "completions/mean_length": 1768.0833740234375, "completions/mean_terminated_length": 1768.0833740234375, "completions/min_length": 545.0, "completions/min_terminated_length": 545.0, "epoch": 0.2025101763907734, "frac_reward_zero_std": 1.0, "grad_norm": 2.6289086463293643e-07, "kl": 0.0, "learning_rate": 4.057971014492754e-07, "loss": 0.0, "num_tokens": 20738388.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 597 }, { "completion_length": 2543.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4530.0, "completions/max_terminated_length": 4530.0, "completions/mean_length": 2543.0, "completions/mean_terminated_length": 2543.0, "completions/min_length": 1381.0, "completions/min_terminated_length": 1381.0, "epoch": 0.2028493894165536, "frac_reward_zero_std": 1.0, "grad_norm": 1.6963534221758891e-07, "kl": 0.0, "learning_rate": 4.056245686680469e-07, "loss": 0.0, "num_tokens": 20782416.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 598 }, { "completion_length": 1435.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2731.0, "completions/max_terminated_length": 2731.0, "completions/mean_length": 1435.3333740234375, "completions/mean_terminated_length": 1435.3333740234375, "completions/min_length": 720.0, "completions/min_terminated_length": 720.0, "epoch": 0.20318860244233378, "frac_reward_zero_std": 0.5, "grad_norm": 0.1025226041674614, "kl": 0.0, "learning_rate": 4.054520358868185e-07, "loss": 0.0002, "num_tokens": 20813830.0, "reward": 1.1500000953674316, "reward_std": 0.05477222427725792, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 599 }, { "completion_length": 794.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1313.0, "completions/max_terminated_length": 1313.0, "completions/mean_length": 794.75, "completions/mean_terminated_length": 794.75, "completions/min_length": 557.0, "completions/min_terminated_length": 557.0, "epoch": 0.20352781546811397, "frac_reward_zero_std": 0.5, "grad_norm": 0.06588659435510635, "kl": 0.0, "learning_rate": 4.0527950310559005e-07, "loss": 0.0002, "num_tokens": 20837977.0, "reward": 1.2666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 600 }, { "completion_length": 1211.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1856.0, "completions/max_terminated_length": 1856.0, "completions/mean_length": 1211.3333740234375, "completions/mean_terminated_length": 1211.3333740234375, "completions/min_length": 832.0, "completions/min_terminated_length": 832.0, "epoch": 0.20386702849389415, "frac_reward_zero_std": 1.0, "grad_norm": 1.0307384457064472e-07, "kl": 0.0, "learning_rate": 4.051069703243616e-07, "loss": 0.0, "num_tokens": 20861033.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 601 }, { "completion_length": 2811.416748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6511.0, "completions/mean_length": 3360.5, "completions/mean_terminated_length": 3067.0, "completions/min_length": 1628.0, "completions/min_terminated_length": 1628.0, "epoch": 0.20420624151967434, "frac_reward_zero_std": 0.0, "grad_norm": 0.17239151895046234, "kl": NaN, "learning_rate": 4.0493443754313316e-07, "loss": -0.0131, "num_tokens": 20907724.0, "reward": 1.2208333015441895, "reward_std": 0.1265007108449936, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 602 }, { "completion_length": 2544.7501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 5494.0, "completions/max_terminated_length": 5494.0, "completions/mean_length": 2544.75, "completions/mean_terminated_length": 2544.75, "completions/min_length": 660.0, "completions/min_terminated_length": 660.0, "epoch": 0.20454545454545456, "frac_reward_zero_std": 0.0, "grad_norm": 0.5830628871917725, "kl": 0.0, "learning_rate": 4.0476190476190476e-07, "loss": -0.0089, "num_tokens": 20947273.0, "reward": 0.9666666984558105, "reward_std": 0.2707287669181824, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 603 }, { "completion_length": 2008.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3416.0, "completions/max_terminated_length": 3416.0, "completions/mean_length": 2008.25, "completions/mean_terminated_length": 2008.25, "completions/min_length": 621.0, "completions/min_terminated_length": 621.0, "epoch": 0.20488466757123475, "frac_reward_zero_std": 0.5, "grad_norm": 0.4334089159965515, "kl": 0.0, "learning_rate": 4.045893719806763e-07, "loss": -0.0058, "num_tokens": 20984080.0, "reward": 0.6208333969116211, "reward_std": 0.19391795992851257, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 604 }, { "completion_length": 1733.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3784.0, "completions/max_terminated_length": 3784.0, "completions/mean_length": 1733.0, "completions/mean_terminated_length": 1733.0, "completions/min_length": 863.0, "completions/min_terminated_length": 863.0, "epoch": 0.20522388059701493, "frac_reward_zero_std": 0.5, "grad_norm": 0.4287903904914856, "kl": 0.0, "learning_rate": 4.0441683919944787e-07, "loss": 0.0233, "num_tokens": 21015982.0, "reward": 0.833333432674408, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 605 }, { "completion_length": 2514.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4962.0, "completions/max_terminated_length": 4962.0, "completions/mean_length": 2514.166748046875, "completions/mean_terminated_length": 2514.166748046875, "completions/min_length": 1261.0, "completions/min_terminated_length": 1261.0, "epoch": 0.20556309362279512, "frac_reward_zero_std": 0.0, "grad_norm": 0.6668354272842407, "kl": 0.0, "learning_rate": 4.042443064182194e-07, "loss": 0.03, "num_tokens": 21056592.0, "reward": 1.070833444595337, "reward_std": 0.2486901879310608, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.26227444410324097, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 606 }, { "completion_length": 1062.1666870117188, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 3721.0, "completions/mean_length": 2160.33349609375, "completions/mean_terminated_length": 1274.5999755859375, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 0.2059023066485753, "frac_reward_zero_std": 0.5, "grad_norm": 0.10183367133140564, "kl": NaN, "learning_rate": 4.0407177363699103e-07, "loss": -0.0125, "num_tokens": 21083894.0, "reward": 0.6500000953674316, "reward_std": 0.0774596780538559, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 607 }, { "completion_length": 2220.5001220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6247.0, "completions/mean_length": 2769.58349609375, "completions/mean_terminated_length": 2422.36376953125, "completions/min_length": 623.0, "completions/min_terminated_length": 623.0, "epoch": 0.2062415196743555, "frac_reward_zero_std": 0.0, "grad_norm": 0.11050547659397125, "kl": NaN, "learning_rate": 4.038992408557626e-07, "loss": -0.0143, "num_tokens": 21120482.0, "reward": 0.7416666746139526, "reward_std": 0.1128769963979721, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 608 }, { "completion_length": 1264.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1872.0, "completions/max_terminated_length": 1872.0, "completions/mean_length": 1264.5833740234375, "completions/mean_terminated_length": 1264.5833740234375, "completions/min_length": 664.0, "completions/min_terminated_length": 664.0, "epoch": 0.2065807327001357, "frac_reward_zero_std": 1.0, "grad_norm": 1.325391423279143e-07, "kl": 0.0, "learning_rate": 4.0372670807453413e-07, "loss": 0.0, "num_tokens": 21142509.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 609 }, { "completion_length": 1594.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3316.0, "completions/max_terminated_length": 3316.0, "completions/mean_length": 1594.5, "completions/mean_terminated_length": 1594.5, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "epoch": 0.20691994572591588, "frac_reward_zero_std": 1.0, "grad_norm": 9.580161020039668e-08, "kl": 0.0, "learning_rate": 4.035541752933057e-07, "loss": 0.0, "num_tokens": 21176565.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 610 }, { "completion_length": 2312.916748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5161.0, "completions/mean_length": 2862.0, "completions/mean_terminated_length": 2523.181884765625, "completions/min_length": 646.0, "completions/min_terminated_length": 646.0, "epoch": 0.20725915875169607, "frac_reward_zero_std": 0.5, "grad_norm": 0.09654112905263901, "kl": NaN, "learning_rate": 4.033816425120773e-07, "loss": -0.0072, "num_tokens": 21219494.0, "reward": 0.6750000715255737, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 611 }, { "completion_length": 1352.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2310.0, "completions/max_terminated_length": 2310.0, "completions/mean_length": 1352.666748046875, "completions/mean_terminated_length": 1352.666748046875, "completions/min_length": 523.0, "completions/min_terminated_length": 523.0, "epoch": 0.20759837177747625, "frac_reward_zero_std": 1.0, "grad_norm": 1.9667629658215446e-07, "kl": 0.0, "learning_rate": 4.032091097308488e-07, "loss": 0.0, "num_tokens": 21250126.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 612 }, { "completion_length": 2175.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4093.0, "completions/max_terminated_length": 4093.0, "completions/mean_length": 2175.58349609375, "completions/mean_terminated_length": 2175.58349609375, "completions/min_length": 1203.0, "completions/min_terminated_length": 1203.0, "epoch": 0.20793758480325644, "frac_reward_zero_std": 0.0, "grad_norm": 0.5248443484306335, "kl": 0.0, "learning_rate": 4.030365769496204e-07, "loss": 0.0088, "num_tokens": 21281873.0, "reward": 1.1666667461395264, "reward_std": 0.2588964104652405, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.287096232175827, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 613 }, { "completion_length": 2334.75, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6393.0, "completions/mean_length": 4531.08349609375, "completions/mean_terminated_length": 3502.125, "completions/min_length": 880.0, "completions/min_terminated_length": 880.0, "epoch": 0.20827679782903663, "frac_reward_zero_std": 0.5, "grad_norm": 0.24544914066791534, "kl": NaN, "learning_rate": 4.02864044168392e-07, "loss": -0.0219, "num_tokens": 21318176.0, "reward": 0.6000000834465027, "reward_std": 0.0774596780538559, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.14770980179309845, "step": 614 }, { "completion_length": 606.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1353.0, "completions/max_terminated_length": 1353.0, "completions/mean_length": 606.3333740234375, "completions/mean_terminated_length": 606.3333740234375, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.20861601085481682, "frac_reward_zero_std": 0.5, "grad_norm": 0.37288540601730347, "kl": 0.0, "learning_rate": 4.0269151138716356e-07, "loss": -0.0039, "num_tokens": 21336690.0, "reward": 1.1041667461395264, "reward_std": 0.23474276065826416, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 615 }, { "completion_length": 1619.416748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4832.0, "completions/mean_length": 2168.5, "completions/mean_terminated_length": 1766.636474609375, "completions/min_length": 653.0, "completions/min_terminated_length": 653.0, "epoch": 0.208955223880597, "frac_reward_zero_std": 0.5, "grad_norm": 0.07448533922433853, "kl": NaN, "learning_rate": 4.025189786059351e-07, "loss": -0.0086, "num_tokens": 21372959.0, "reward": 0.7749999761581421, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 616 }, { "completion_length": 2493.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5083.0, "completions/max_terminated_length": 5083.0, "completions/mean_length": 2493.75, "completions/mean_terminated_length": 2493.75, "completions/min_length": 848.0, "completions/min_terminated_length": 848.0, "epoch": 0.2092944369063772, "frac_reward_zero_std": 1.0, "grad_norm": 1.4945359794182878e-07, "kl": 0.0, "learning_rate": 4.0234644582470666e-07, "loss": 0.0, "num_tokens": 21419780.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 617 }, { "completion_length": 1921.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2628.0, "completions/max_terminated_length": 2628.0, "completions/mean_length": 1921.0833740234375, "completions/mean_terminated_length": 1921.0833740234375, "completions/min_length": 1042.0, "completions/min_terminated_length": 1042.0, "epoch": 0.20963364993215738, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.0217391304347827e-07, "loss": 0.0, "num_tokens": 21449523.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 618 }, { "completion_length": 701.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1136.0, "completions/max_terminated_length": 1136.0, "completions/mean_length": 701.75, "completions/mean_terminated_length": 701.75, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 0.20997286295793757, "frac_reward_zero_std": 0.5, "grad_norm": 0.060739900916814804, "kl": 0.0, "learning_rate": 4.020013802622498e-07, "loss": 0.0002, "num_tokens": 21467394.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 619 }, { "completion_length": 1590.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3286.0, "completions/max_terminated_length": 3286.0, "completions/mean_length": 1590.416748046875, "completions/mean_terminated_length": 1590.416748046875, "completions/min_length": 756.0, "completions/min_terminated_length": 756.0, "epoch": 0.21031207598371776, "frac_reward_zero_std": 0.0, "grad_norm": 0.14383164048194885, "kl": 0.0, "learning_rate": 4.018288474810214e-07, "loss": 0.0018, "num_tokens": 21492803.0, "reward": 1.183333396911621, "reward_std": 0.09246458858251572, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 620 }, { "completion_length": 2816.75, "completions/clipped_ratio": 0.0, "completions/max_length": 4851.0, "completions/max_terminated_length": 4851.0, "completions/mean_length": 2816.75, "completions/mean_terminated_length": 2816.75, "completions/min_length": 1029.0, "completions/min_terminated_length": 1029.0, "epoch": 0.21065128900949798, "frac_reward_zero_std": 0.5, "grad_norm": 0.1251380890607834, "kl": 0.0, "learning_rate": 4.0165631469979293e-07, "loss": -0.0009, "num_tokens": 21541196.0, "reward": 1.133333444595337, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 621 }, { "completion_length": 977.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1705.0, "completions/max_terminated_length": 1705.0, "completions/mean_length": 977.25, "completions/mean_terminated_length": 977.25, "completions/min_length": 595.0, "completions/min_terminated_length": 595.0, "epoch": 0.21099050203527817, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.0148378191856453e-07, "loss": 0.0, "num_tokens": 21562223.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 622 }, { "completion_length": 1879.0834350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 4423.0, "completions/max_terminated_length": 4423.0, "completions/mean_length": 1879.0833740234375, "completions/mean_terminated_length": 1879.0833740234375, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 0.21132971506105835, "frac_reward_zero_std": 0.0, "grad_norm": 0.135577991604805, "kl": 0.0, "learning_rate": 4.0131124913733603e-07, "loss": -0.0018, "num_tokens": 21602580.0, "reward": 1.1541666984558105, "reward_std": 0.06497842073440552, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 623 }, { "completion_length": 2042.5, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 6396.0, "completions/mean_length": 3689.75, "completions/mean_terminated_length": 2723.333251953125, "completions/min_length": 858.0, "completions/min_terminated_length": 858.0, "epoch": 0.21166892808683854, "frac_reward_zero_std": 0.0, "grad_norm": 1.0774798393249512, "kl": NaN, "learning_rate": 4.0113871635610764e-07, "loss": -0.1023, "num_tokens": 21640638.0, "reward": 0.8916667699813843, "reward_std": 0.3968444764614105, "rewards/correctness_reward_func/mean": 0.6666666269302368, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 624 }, { "completion_length": 1797.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4817.0, "completions/max_terminated_length": 4817.0, "completions/mean_length": 1797.5833740234375, "completions/mean_terminated_length": 1797.5833740234375, "completions/min_length": 551.0, "completions/min_terminated_length": 551.0, "epoch": 0.21200814111261873, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.009661835748792e-07, "loss": 0.0, "num_tokens": 21676483.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 625 }, { "completion_length": 2595.0001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 5171.0, "completions/max_terminated_length": 5171.0, "completions/mean_length": 2595.0, "completions/mean_terminated_length": 2595.0, "completions/min_length": 508.0, "completions/min_terminated_length": 508.0, "epoch": 0.21234735413839892, "frac_reward_zero_std": 0.0, "grad_norm": 0.4653422236442566, "kl": 0.0, "learning_rate": 4.007936507936508e-07, "loss": -0.0322, "num_tokens": 21721933.0, "reward": 1.1708333492279053, "reward_std": 0.2519000172615051, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 626 }, { "completion_length": 768.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1599.0, "completions/max_terminated_length": 1599.0, "completions/mean_length": 768.5, "completions/mean_terminated_length": 768.5, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 0.2126865671641791, "frac_reward_zero_std": 0.5, "grad_norm": 0.05752098560333252, "kl": 0.0, "learning_rate": 4.006211180124223e-07, "loss": -0.0001, "num_tokens": 21742123.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 627 }, { "completion_length": 1799.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3195.0, "completions/max_terminated_length": 3195.0, "completions/mean_length": 1799.0833740234375, "completions/mean_terminated_length": 1799.0833740234375, "completions/min_length": 731.0, "completions/min_terminated_length": 731.0, "epoch": 0.2130257801899593, "frac_reward_zero_std": 0.5, "grad_norm": 0.11317743360996246, "kl": 0.0, "learning_rate": 4.004485852311939e-07, "loss": 0.0022, "num_tokens": 21775028.0, "reward": 0.7041667699813843, "reward_std": 0.05571504682302475, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 628 }, { "completion_length": 2136.5834350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 4480.0, "completions/max_terminated_length": 4480.0, "completions/mean_length": 2136.58349609375, "completions/mean_terminated_length": 2136.58349609375, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 0.21336499321573948, "frac_reward_zero_std": 1.0, "grad_norm": 1.4856317420708365e-07, "kl": 0.0, "learning_rate": 4.002760524499655e-07, "loss": 0.0, "num_tokens": 21813543.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 629 }, { "completion_length": 1928.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5380.0, "completions/max_terminated_length": 5380.0, "completions/mean_length": 1928.5833740234375, "completions/mean_terminated_length": 1928.5833740234375, "completions/min_length": 874.0, "completions/min_terminated_length": 874.0, "epoch": 0.21370420624151967, "frac_reward_zero_std": 0.5, "grad_norm": 0.1090475544333458, "kl": 0.0, "learning_rate": 4.0010351966873706e-07, "loss": 0.0027, "num_tokens": 21848944.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 630 }, { "completion_length": 1004.1666870117188, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 3440.0, "completions/mean_length": 1553.25, "completions/mean_terminated_length": 1095.45458984375, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 0.21404341926729986, "frac_reward_zero_std": 0.5, "grad_norm": 0.3620413541793823, "kl": NaN, "learning_rate": 3.999309868875086e-07, "loss": -0.0181, "num_tokens": 21870372.0, "reward": 1.1583333015441895, "reward_std": 0.25380438566207886, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 631 }, { "completion_length": 2100.7501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 3441.0, "completions/max_terminated_length": 3441.0, "completions/mean_length": 2100.75, "completions/mean_terminated_length": 2100.75, "completions/min_length": 830.0, "completions/min_terminated_length": 830.0, "epoch": 0.21438263229308005, "frac_reward_zero_std": 0.0, "grad_norm": 0.1738094985485077, "kl": 0.0, "learning_rate": 3.9975845410628017e-07, "loss": 0.0015, "num_tokens": 21907617.0, "reward": 1.2166666984558105, "reward_std": 0.10641199350357056, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 632 }, { "completion_length": 3179.416748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6277.0, "completions/mean_length": 3728.5, "completions/mean_terminated_length": 3468.45458984375, "completions/min_length": 1711.0, "completions/min_terminated_length": 1711.0, "epoch": 0.21472184531886024, "frac_reward_zero_std": 0.5, "grad_norm": 0.8372994661331177, "kl": NaN, "learning_rate": 3.9958592132505177e-07, "loss": -0.0076, "num_tokens": 21959930.0, "reward": 0.8750001788139343, "reward_std": 0.2524876296520233, "rewards/correctness_reward_func/mean": 0.5999999642372131, "rewards/correctness_reward_func/std": 0.36181363463401794, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 633 }, { "completion_length": 1077.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2167.0, "completions/max_terminated_length": 2167.0, "completions/mean_length": 1077.416748046875, "completions/mean_terminated_length": 1077.416748046875, "completions/min_length": 652.0, "completions/min_terminated_length": 652.0, "epoch": 0.21506105834464043, "frac_reward_zero_std": 1.0, "grad_norm": 2.7163471827407193e-07, "kl": 0.0, "learning_rate": 3.9941338854382327e-07, "loss": 0.0, "num_tokens": 21984481.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 634 }, { "completion_length": 1683.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3120.0, "completions/max_terminated_length": 3120.0, "completions/mean_length": 1683.0, "completions/mean_terminated_length": 1683.0, "completions/min_length": 1112.0, "completions/min_terminated_length": 1112.0, "epoch": 0.21540027137042062, "frac_reward_zero_std": 1.0, "grad_norm": 2.3120736614146153e-07, "kl": 0.0, "learning_rate": 3.992408557625949e-07, "loss": 0.0, "num_tokens": 22015363.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 635 }, { "completion_length": 1880.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4715.0, "completions/max_terminated_length": 4715.0, "completions/mean_length": 1880.0, "completions/mean_terminated_length": 1880.0, "completions/min_length": 657.0, "completions/min_terminated_length": 657.0, "epoch": 0.2157394843962008, "frac_reward_zero_std": 1.0, "grad_norm": 2.4236021545220865e-07, "kl": 0.0, "learning_rate": 3.9906832298136643e-07, "loss": 0.0, "num_tokens": 22054771.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 636 }, { "completion_length": 1170.9166870117188, "completions/clipped_ratio": 0.5, "completions/max_length": 6589.0, "completions/max_terminated_length": 4163.0, "completions/mean_length": 4465.4169921875, "completions/mean_terminated_length": 2341.83349609375, "completions/min_length": 1286.0, "completions/min_terminated_length": 1286.0, "epoch": 0.216078697421981, "frac_reward_zero_std": 0.5, "grad_norm": 0.13113607466220856, "kl": NaN, "learning_rate": 3.9889579020013804e-07, "loss": -0.0029, "num_tokens": 22081302.0, "reward": 0.6166666746139526, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.15000000596046448, "rewards/format_reward_func/std": 0.15666989982128143, "step": 637 }, { "completion_length": 1980.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 5188.0, "completions/max_terminated_length": 5188.0, "completions/mean_length": 1980.0, "completions/mean_terminated_length": 1980.0, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 0.21641791044776118, "frac_reward_zero_std": 0.5, "grad_norm": 0.6408343315124512, "kl": 0.0, "learning_rate": 3.9872325741890954e-07, "loss": -0.0118, "num_tokens": 22119900.0, "reward": 0.8833333849906921, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.5833333134651184, "rewards/correctness_reward_func/std": 0.5149286389350891, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 638 }, { "completion_length": 3710.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6489.0, "completions/max_terminated_length": 6489.0, "completions/mean_length": 3710.666748046875, "completions/mean_terminated_length": 3710.666748046875, "completions/min_length": 2540.0, "completions/min_terminated_length": 2540.0, "epoch": 0.2167571234735414, "frac_reward_zero_std": 0.0, "grad_norm": 1.0129146575927734, "kl": 0.0, "learning_rate": 3.9855072463768114e-07, "loss": 0.016, "num_tokens": 22177430.0, "reward": 1.1000001430511475, "reward_std": 0.21493908762931824, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.2696799635887146, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 639 }, { "completion_length": 1299.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2228.0, "completions/max_terminated_length": 2228.0, "completions/mean_length": 1299.416748046875, "completions/mean_terminated_length": 1299.416748046875, "completions/min_length": 931.0, "completions/min_terminated_length": 931.0, "epoch": 0.21709633649932158, "frac_reward_zero_std": 0.5, "grad_norm": 0.09300534427165985, "kl": 0.0, "learning_rate": 3.983781918564527e-07, "loss": 0.0, "num_tokens": 22206049.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 640 }, { "completion_length": 3040.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 6160.0, "completions/max_terminated_length": 6160.0, "completions/mean_length": 3040.08349609375, "completions/mean_terminated_length": 3040.08349609375, "completions/min_length": 627.0, "completions/min_terminated_length": 627.0, "epoch": 0.21743554952510177, "frac_reward_zero_std": 0.5, "grad_norm": 0.1536818891763687, "kl": 0.0, "learning_rate": 3.982056590752243e-07, "loss": 0.0004, "num_tokens": 22256750.0, "reward": 1.2666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 641 }, { "completion_length": 2255.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 5187.0, "completions/max_terminated_length": 5187.0, "completions/mean_length": 2255.25, "completions/mean_terminated_length": 2255.25, "completions/min_length": 754.0, "completions/min_terminated_length": 754.0, "epoch": 0.21777476255088196, "frac_reward_zero_std": 0.5, "grad_norm": 0.8280144929885864, "kl": 0.0, "learning_rate": 3.980331262939958e-07, "loss": 0.0191, "num_tokens": 22293539.0, "reward": 0.6333333253860474, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 642 }, { "completion_length": 1590.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5887.0, "completions/max_terminated_length": 5887.0, "completions/mean_length": 1590.8333740234375, "completions/mean_terminated_length": 1590.8333740234375, "completions/min_length": 726.0, "completions/min_terminated_length": 726.0, "epoch": 0.21811397557666215, "frac_reward_zero_std": 0.5, "grad_norm": 0.10805870592594147, "kl": 0.0, "learning_rate": 3.978605935127674e-07, "loss": 0.0101, "num_tokens": 22323081.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 643 }, { "completion_length": 2101.8334350585938, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 6169.0, "completions/mean_length": 3749.08349609375, "completions/mean_terminated_length": 2802.444580078125, "completions/min_length": 743.0, "completions/min_terminated_length": 743.0, "epoch": 0.21845318860244234, "frac_reward_zero_std": 0.5, "grad_norm": 0.9981639385223389, "kl": NaN, "learning_rate": 3.97688060731539e-07, "loss": -0.0685, "num_tokens": 22360183.0, "reward": 0.7208334803581238, "reward_std": 0.2441396415233612, "rewards/correctness_reward_func/mean": 0.4833333194255829, "rewards/correctness_reward_func/std": 0.43029236793518066, "rewards/format_reward_func/mean": 0.23750001192092896, "rewards/format_reward_func/std": 0.11894422769546509, "step": 644 }, { "completion_length": 1810.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4908.0, "completions/max_terminated_length": 4908.0, "completions/mean_length": 1810.5833740234375, "completions/mean_terminated_length": 1810.5833740234375, "completions/min_length": 656.0, "completions/min_terminated_length": 656.0, "epoch": 0.21879240162822253, "frac_reward_zero_std": 0.0, "grad_norm": 0.16437654197216034, "kl": 0.0, "learning_rate": 3.975155279503105e-07, "loss": -0.006, "num_tokens": 22390598.0, "reward": 1.1666667461395264, "reward_std": 0.09559705853462219, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 645 }, { "completion_length": 1326.6667175292969, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 3824.0, "completions/mean_length": 2424.83349609375, "completions/mean_terminated_length": 1592.0, "completions/min_length": 742.0, "completions/min_terminated_length": 742.0, "epoch": 0.21913161465400272, "frac_reward_zero_std": 0.5, "grad_norm": 0.23768624663352966, "kl": NaN, "learning_rate": 3.973429951690821e-07, "loss": -0.0106, "num_tokens": 22418914.0, "reward": 0.6500000953674316, "reward_std": 0.0774596780538559, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 646 }, { "completion_length": 951.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3743.0, "completions/max_terminated_length": 3743.0, "completions/mean_length": 951.5, "completions/mean_terminated_length": 951.5, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.2194708276797829, "frac_reward_zero_std": 0.0, "grad_norm": 0.10783212631940842, "kl": 0.0, "learning_rate": 3.9717046238785367e-07, "loss": -0.0026, "num_tokens": 22442500.0, "reward": 1.254166603088379, "reward_std": 0.08225837349891663, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 647 }, { "completion_length": 1173.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2590.0, "completions/max_terminated_length": 2590.0, "completions/mean_length": 1173.5, "completions/mean_terminated_length": 1173.5, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.2198100407055631, "frac_reward_zero_std": 1.0, "grad_norm": 1.3344381954993878e-07, "kl": 0.0, "learning_rate": 3.969979296066253e-07, "loss": 0.0, "num_tokens": 22468684.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 648 }, { "completion_length": 1362.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3917.0, "completions/max_terminated_length": 3917.0, "completions/mean_length": 1362.166748046875, "completions/mean_terminated_length": 1362.166748046875, "completions/min_length": 527.0, "completions/min_terminated_length": 527.0, "epoch": 0.22014925373134328, "frac_reward_zero_std": 0.5, "grad_norm": 0.06996418535709381, "kl": 0.0, "learning_rate": 3.968253968253968e-07, "loss": -0.0004, "num_tokens": 22499058.0, "reward": 1.1666667461395264, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 649 }, { "completion_length": 1067.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1865.0, "completions/max_terminated_length": 1865.0, "completions/mean_length": 1067.0, "completions/mean_terminated_length": 1067.0, "completions/min_length": 601.0, "completions/min_terminated_length": 601.0, "epoch": 0.22048846675712347, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.966528640441684e-07, "loss": 0.0, "num_tokens": 22521366.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 650 }, { "completion_length": 1641.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4213.0, "completions/max_terminated_length": 4213.0, "completions/mean_length": 1641.8333740234375, "completions/mean_terminated_length": 1641.8333740234375, "completions/min_length": 992.0, "completions/min_terminated_length": 992.0, "epoch": 0.22082767978290366, "frac_reward_zero_std": 0.5, "grad_norm": 0.08669435232877731, "kl": 0.0, "learning_rate": 3.9648033126293993e-07, "loss": 0.0011, "num_tokens": 22550260.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 651 }, { "completion_length": 2429.0834350585938, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 4880.0, "completions/mean_length": 3527.25, "completions/mean_terminated_length": 2914.900146484375, "completions/min_length": 1257.0, "completions/min_terminated_length": 1257.0, "epoch": 0.22116689280868385, "frac_reward_zero_std": 0.5, "grad_norm": 0.5226991176605225, "kl": NaN, "learning_rate": 3.9630779848171154e-07, "loss": -0.0914, "num_tokens": 22593785.0, "reward": 0.9666668176651001, "reward_std": 0.32506412267684937, "rewards/correctness_reward_func/mean": 0.7166666984558105, "rewards/correctness_reward_func/std": 0.3459725081920624, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 652 }, { "completion_length": 1602.3333740234375, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4316.0, "completions/mean_length": 3798.666748046875, "completions/mean_terminated_length": 2403.5, "completions/min_length": 1219.0, "completions/min_terminated_length": 1219.0, "epoch": 0.22150610583446403, "frac_reward_zero_std": 0.5, "grad_norm": 0.16389794647693634, "kl": NaN, "learning_rate": 3.9613526570048304e-07, "loss": -0.019, "num_tokens": 22626027.0, "reward": 0.20000001788139343, "reward_std": 0.0774596780538559, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.14770980179309845, "step": 653 }, { "completion_length": 970.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2010.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 970.8333740234375, "completions/mean_terminated_length": 970.8333740234375, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "epoch": 0.22184531886024422, "frac_reward_zero_std": 0.5, "grad_norm": 0.08526045829057693, "kl": 0.0, "learning_rate": 3.9596273291925465e-07, "loss": -0.0016, "num_tokens": 22649347.0, "reward": 1.2333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 654 }, { "completion_length": 2165.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3378.0, "completions/max_terminated_length": 3378.0, "completions/mean_length": 2165.75, "completions/mean_terminated_length": 2165.75, "completions/min_length": 1147.0, "completions/min_terminated_length": 1147.0, "epoch": 0.2221845318860244, "frac_reward_zero_std": 0.5, "grad_norm": 0.11644528061151505, "kl": 0.0, "learning_rate": 3.957902001380262e-07, "loss": 0.0005, "num_tokens": 22690942.0, "reward": 1.2666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 655 }, { "completion_length": 3226.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4712.0, "completions/max_terminated_length": 4712.0, "completions/mean_length": 3226.916748046875, "completions/mean_terminated_length": 3226.916748046875, "completions/min_length": 1360.0, "completions/min_terminated_length": 1360.0, "epoch": 0.2225237449118046, "frac_reward_zero_std": 0.5, "grad_norm": 0.16651302576065063, "kl": 0.0, "learning_rate": 3.9561766735679775e-07, "loss": -0.0028, "num_tokens": 22740867.0, "reward": 0.75, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.44999995827674866, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 656 }, { "completion_length": 2330.666748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6434.0, "completions/mean_length": 2879.75, "completions/mean_terminated_length": 2542.54541015625, "completions/min_length": 1151.0, "completions/min_terminated_length": 1151.0, "epoch": 0.22286295793758482, "frac_reward_zero_std": 0.0, "grad_norm": 0.2756352722644806, "kl": NaN, "learning_rate": 3.954451345755693e-07, "loss": -0.0489, "num_tokens": 22779851.0, "reward": 1.0750000476837158, "reward_std": 0.29088661074638367, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.2696799635887146, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 657 }, { "completion_length": 678.5833435058594, "completions/clipped_ratio": 0.41666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 4787.0, "completions/mean_length": 3424.0, "completions/mean_terminated_length": 1163.2857666015625, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 0.223202170963365, "frac_reward_zero_std": 0.0, "grad_norm": 0.3502046465873718, "kl": NaN, "learning_rate": 3.952726017943409e-07, "loss": -0.0154, "num_tokens": 22801434.0, "reward": 0.6083333492279053, "reward_std": 0.1128770112991333, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.17499999701976776, "rewards/format_reward_func/std": 0.15447859466075897, "step": 658 }, { "completion_length": 1686.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 5783.0, "completions/max_terminated_length": 5783.0, "completions/mean_length": 1686.75, "completions/mean_terminated_length": 1686.75, "completions/min_length": 722.0, "completions/min_terminated_length": 722.0, "epoch": 0.2235413839891452, "frac_reward_zero_std": 0.0, "grad_norm": 0.9250512719154358, "kl": 0.0, "learning_rate": 3.951000690131125e-07, "loss": 0.0408, "num_tokens": 22831161.0, "reward": 1.0833333730697632, "reward_std": 0.3129711151123047, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.37618499994277954, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 659 }, { "completion_length": 2479.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5375.0, "completions/max_terminated_length": 5375.0, "completions/mean_length": 2479.5, "completions/mean_terminated_length": 2479.5, "completions/min_length": 796.0, "completions/min_terminated_length": 796.0, "epoch": 0.22388059701492538, "frac_reward_zero_std": 0.5, "grad_norm": 0.5752743482589722, "kl": 0.0, "learning_rate": 3.94927536231884e-07, "loss": 0.0219, "num_tokens": 22872837.0, "reward": 1.149999976158142, "reward_std": 0.17606817185878754, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.28444525599479675, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 660 }, { "completion_length": 3482.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5768.0, "completions/max_terminated_length": 5768.0, "completions/mean_length": 3482.916748046875, "completions/mean_terminated_length": 3482.916748046875, "completions/min_length": 842.0, "completions/min_terminated_length": 842.0, "epoch": 0.22421981004070557, "frac_reward_zero_std": 0.5, "grad_norm": 0.8260320425033569, "kl": 0.0, "learning_rate": 3.947550034506556e-07, "loss": -0.0385, "num_tokens": 22924898.0, "reward": 1.0166666507720947, "reward_std": 0.24013885855674744, "rewards/correctness_reward_func/mean": 0.7166666984558105, "rewards/correctness_reward_func/std": 0.4386618733406067, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 661 }, { "completion_length": 1033.9167175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 1684.0, "completions/max_terminated_length": 1684.0, "completions/mean_length": 1033.916748046875, "completions/mean_terminated_length": 1033.916748046875, "completions/min_length": 538.0, "completions/min_terminated_length": 538.0, "epoch": 0.22455902306648576, "frac_reward_zero_std": 0.5, "grad_norm": 0.404193252325058, "kl": 0.0, "learning_rate": 3.945824706694272e-07, "loss": -0.0029, "num_tokens": 22953091.0, "reward": 1.058333396911621, "reward_std": 0.2610875070095062, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.37618499994277954, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 662 }, { "completion_length": 1029.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2507.0, "completions/max_terminated_length": 2507.0, "completions/mean_length": 1029.3333740234375, "completions/mean_terminated_length": 1029.3333740234375, "completions/min_length": 582.0, "completions/min_terminated_length": 582.0, "epoch": 0.22489823609226595, "frac_reward_zero_std": 1.0, "grad_norm": 1.0883605483513747e-07, "kl": 0.0, "learning_rate": 3.944099378881988e-07, "loss": 0.0, "num_tokens": 22976513.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 663 }, { "completion_length": 670.0, "completions/clipped_ratio": 0.0, "completions/max_length": 888.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 670.0, "completions/mean_terminated_length": 670.0, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 0.22523744911804613, "frac_reward_zero_std": 0.5, "grad_norm": 0.06571773439645767, "kl": 0.0, "learning_rate": 3.942374051069703e-07, "loss": -0.0005, "num_tokens": 22999475.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 664 }, { "completion_length": 2176.8333740234375, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6299.0, "completions/mean_length": 3275.0, "completions/mean_terminated_length": 2612.199951171875, "completions/min_length": 997.0, "completions/min_terminated_length": 997.0, "epoch": 0.22557666214382632, "frac_reward_zero_std": 0.5, "grad_norm": 0.5015944838523865, "kl": NaN, "learning_rate": 3.940648723257419e-07, "loss": 0.0043, "num_tokens": 23037429.0, "reward": 0.8166667819023132, "reward_std": 0.30441200733184814, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.42497774958610535, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 665 }, { "completion_length": 2563.0001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 5240.0, "completions/max_terminated_length": 5240.0, "completions/mean_length": 2563.0, "completions/mean_terminated_length": 2563.0, "completions/min_length": 923.0, "completions/min_terminated_length": 923.0, "epoch": 0.2259158751696065, "frac_reward_zero_std": 0.5, "grad_norm": 0.5822378396987915, "kl": 0.0, "learning_rate": 3.9389233954451344e-07, "loss": 0.026, "num_tokens": 23078463.0, "reward": 1.066666603088379, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.7666666507720947, "rewards/correctness_reward_func/std": 0.3700941801071167, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 666 }, { "completion_length": 2461.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4158.0, "completions/max_terminated_length": 4158.0, "completions/mean_length": 2461.33349609375, "completions/mean_terminated_length": 2461.33349609375, "completions/min_length": 1184.0, "completions/min_terminated_length": 1184.0, "epoch": 0.2262550881953867, "frac_reward_zero_std": 1.0, "grad_norm": 1.4431849137963582e-07, "kl": 0.0, "learning_rate": 3.9371980676328504e-07, "loss": 0.0, "num_tokens": 23119081.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 667 }, { "completion_length": 1863.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4872.0, "completions/max_terminated_length": 4872.0, "completions/mean_length": 1863.3333740234375, "completions/mean_terminated_length": 1863.3333740234375, "completions/min_length": 512.0, "completions/min_terminated_length": 512.0, "epoch": 0.2265943012211669, "frac_reward_zero_std": 0.0, "grad_norm": 0.14633503556251526, "kl": 0.0, "learning_rate": 3.9354727398205654e-07, "loss": 0.0029, "num_tokens": 23151605.0, "reward": 1.2166666984558105, "reward_std": 0.09246456623077393, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 668 }, { "completion_length": 1864.8333740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4246.0, "completions/mean_length": 2413.916748046875, "completions/mean_terminated_length": 2034.3636474609375, "completions/min_length": 911.0, "completions/min_terminated_length": 911.0, "epoch": 0.22693351424694708, "frac_reward_zero_std": 0.0, "grad_norm": 0.5761399269104004, "kl": NaN, "learning_rate": 3.9337474120082815e-07, "loss": -0.0322, "num_tokens": 23187561.0, "reward": 0.8250000476837158, "reward_std": 0.27409863471984863, "rewards/correctness_reward_func/mean": 0.550000011920929, "rewards/correctness_reward_func/std": 0.4100997745990753, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 669 }, { "completion_length": 1130.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1708.0, "completions/max_terminated_length": 1708.0, "completions/mean_length": 1130.916748046875, "completions/mean_terminated_length": 1130.916748046875, "completions/min_length": 778.0, "completions/min_terminated_length": 778.0, "epoch": 0.22727272727272727, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.932022084195997e-07, "loss": 0.0, "num_tokens": 23216174.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 670 }, { "completion_length": 1128.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2095.0, "completions/max_terminated_length": 2095.0, "completions/mean_length": 1128.416748046875, "completions/mean_terminated_length": 1128.416748046875, "completions/min_length": 642.0, "completions/min_terminated_length": 642.0, "epoch": 0.22761194029850745, "frac_reward_zero_std": 0.5, "grad_norm": 0.09337574988603592, "kl": 0.0, "learning_rate": 3.9302967563837126e-07, "loss": 0.0024, "num_tokens": 23244121.0, "reward": 0.75, "reward_std": 0.05477222427725792, "rewards/correctness_reward_func/mean": 0.44999995827674866, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 671 }, { "completion_length": 1535.9166870117188, "completions/clipped_ratio": 0.5, "completions/max_length": 6589.0, "completions/max_terminated_length": 5545.0, "completions/mean_length": 4830.4169921875, "completions/mean_terminated_length": 3071.83349609375, "completions/min_length": 1840.0, "completions/min_terminated_length": 1840.0, "epoch": 0.22795115332428764, "frac_reward_zero_std": 0.5, "grad_norm": 0.6959670782089233, "kl": NaN, "learning_rate": 3.928571428571428e-07, "loss": -0.0068, "num_tokens": 23274990.0, "reward": 0.28333336114883423, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.13333334028720856, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.15000000596046448, "rewards/format_reward_func/std": 0.15666989982128143, "step": 672 }, { "completion_length": 1835.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3286.0, "completions/max_terminated_length": 3286.0, "completions/mean_length": 1835.8333740234375, "completions/mean_terminated_length": 1835.8333740234375, "completions/min_length": 767.0, "completions/min_terminated_length": 767.0, "epoch": 0.22829036635006783, "frac_reward_zero_std": 0.5, "grad_norm": 0.10714925825595856, "kl": 0.0, "learning_rate": 3.926846100759144e-07, "loss": 0.0022, "num_tokens": 23310532.0, "reward": 1.1666667461395264, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 673 }, { "completion_length": 1179.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2333.0, "completions/max_terminated_length": 2333.0, "completions/mean_length": 1179.166748046875, "completions/mean_terminated_length": 1179.166748046875, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "epoch": 0.22862957937584802, "frac_reward_zero_std": 1.0, "grad_norm": 2.31352046853317e-07, "kl": 0.0, "learning_rate": 3.92512077294686e-07, "loss": 0.0, "num_tokens": 23339454.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 674 }, { "completion_length": 2697.08349609375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5319.0, "completions/mean_length": 3246.166748046875, "completions/mean_terminated_length": 2942.272705078125, "completions/min_length": 974.0, "completions/min_terminated_length": 974.0, "epoch": 0.22896879240162823, "frac_reward_zero_std": 0.5, "grad_norm": 0.6637680530548096, "kl": NaN, "learning_rate": 3.923395445134575e-07, "loss": -0.0302, "num_tokens": 23380981.0, "reward": 0.8583332896232605, "reward_std": 0.22453653812408447, "rewards/correctness_reward_func/mean": 0.5833333134651184, "rewards/correctness_reward_func/std": 0.5149286389350891, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 675 }, { "completion_length": 1928.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3160.0, "completions/max_terminated_length": 3160.0, "completions/mean_length": 1928.8333740234375, "completions/mean_terminated_length": 1928.8333740234375, "completions/min_length": 953.0, "completions/min_terminated_length": 953.0, "epoch": 0.22930800542740842, "frac_reward_zero_std": 0.5, "grad_norm": 0.10171937942504883, "kl": 0.0, "learning_rate": 3.921670117322291e-07, "loss": -0.0015, "num_tokens": 23414105.0, "reward": 1.1666667461395264, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 676 }, { "completion_length": 547.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 844.0, "completions/max_terminated_length": 844.0, "completions/mean_length": 547.8333740234375, "completions/mean_terminated_length": 547.8333740234375, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.2296472184531886, "frac_reward_zero_std": 0.5, "grad_norm": 0.059833355247974396, "kl": 0.0, "learning_rate": 3.919944789510007e-07, "loss": 0.0003, "num_tokens": 23431623.0, "reward": 1.133333444595337, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 677 }, { "completion_length": 1936.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4324.0, "completions/max_terminated_length": 4324.0, "completions/mean_length": 1936.75, "completions/mean_terminated_length": 1936.75, "completions/min_length": 579.0, "completions/min_terminated_length": 579.0, "epoch": 0.2299864314789688, "frac_reward_zero_std": 0.5, "grad_norm": 0.4380330443382263, "kl": 0.0, "learning_rate": 3.918219461697723e-07, "loss": -0.0158, "num_tokens": 23468760.0, "reward": 1.133333444595337, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.28069180250167847, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 678 }, { "completion_length": 910.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1362.0, "completions/max_terminated_length": 1362.0, "completions/mean_length": 910.4166870117188, "completions/mean_terminated_length": 910.4166870117188, "completions/min_length": 503.0, "completions/min_terminated_length": 503.0, "epoch": 0.230325644504749, "frac_reward_zero_std": 1.0, "grad_norm": 1.0412031770101748e-07, "kl": 0.0, "learning_rate": 3.916494133885438e-07, "loss": 0.0, "num_tokens": 23489765.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 679 }, { "completion_length": 1419.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2880.0, "completions/max_terminated_length": 2880.0, "completions/mean_length": 1419.8333740234375, "completions/mean_terminated_length": 1419.8333740234375, "completions/min_length": 559.0, "completions/min_terminated_length": 559.0, "epoch": 0.23066485753052918, "frac_reward_zero_std": 0.5, "grad_norm": 0.11209948360919952, "kl": 0.0, "learning_rate": 3.914768806073154e-07, "loss": -0.0005, "num_tokens": 23520645.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 680 }, { "completion_length": 2968.7501220703125, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6533.0, "completions/mean_length": 4066.916748046875, "completions/mean_terminated_length": 3562.5, "completions/min_length": 2128.0, "completions/min_terminated_length": 2128.0, "epoch": 0.23100407055630937, "frac_reward_zero_std": 0.5, "grad_norm": 0.10447150468826294, "kl": NaN, "learning_rate": 3.9130434782608694e-07, "loss": -0.0173, "num_tokens": 23564442.0, "reward": 0.6625000834465027, "reward_std": 0.06274950504302979, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 681 }, { "completion_length": 731.0833740234375, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 1945.0, "completions/mean_length": 2927.416748046875, "completions/mean_terminated_length": 1096.625, "completions/min_length": 620.0, "completions/min_terminated_length": 620.0, "epoch": 0.23134328358208955, "frac_reward_zero_std": 0.5, "grad_norm": 0.145811066031456, "kl": NaN, "learning_rate": 3.911318150448585e-07, "loss": -0.0086, "num_tokens": 23589685.0, "reward": 0.699999988079071, "reward_std": 0.0774596780538559, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.14770980179309845, "step": 682 }, { "completion_length": 841.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1748.0, "completions/max_terminated_length": 1748.0, "completions/mean_length": 841.0, "completions/mean_terminated_length": 841.0, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "epoch": 0.23168249660786974, "frac_reward_zero_std": 0.5, "grad_norm": 0.2751244008541107, "kl": 0.0, "learning_rate": 3.9095928226363005e-07, "loss": -0.0016, "num_tokens": 23610985.0, "reward": 0.9666668176651001, "reward_std": 0.20655910670757294, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 683 }, { "completion_length": 2837.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5740.0, "completions/max_terminated_length": 5740.0, "completions/mean_length": 2837.666748046875, "completions/mean_terminated_length": 2837.666748046875, "completions/min_length": 1397.0, "completions/min_terminated_length": 1397.0, "epoch": 0.23202170963364993, "frac_reward_zero_std": 0.5, "grad_norm": 0.0925978422164917, "kl": 0.0, "learning_rate": 3.9078674948240165e-07, "loss": -0.0017, "num_tokens": 23660457.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 684 }, { "completion_length": 1592.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4128.0, "completions/max_terminated_length": 4128.0, "completions/mean_length": 1592.0, "completions/mean_terminated_length": 1592.0, "completions/min_length": 664.0, "completions/min_terminated_length": 664.0, "epoch": 0.23236092265943012, "frac_reward_zero_std": 0.5, "grad_norm": 0.6901833415031433, "kl": 0.0, "learning_rate": 3.906142167011732e-07, "loss": -0.0094, "num_tokens": 23688075.0, "reward": 0.9666666388511658, "reward_std": 0.2581988573074341, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.4923659861087799, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 685 }, { "completion_length": 566.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 566.25, "completions/mean_terminated_length": 566.25, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.2327001356852103, "frac_reward_zero_std": 0.0, "grad_norm": 0.23003122210502625, "kl": 0.0, "learning_rate": 3.9044168391994476e-07, "loss": 0.0012, "num_tokens": 23707470.0, "reward": 1.1166666746139526, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 686 }, { "completion_length": 1287.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2399.0, "completions/max_terminated_length": 2399.0, "completions/mean_length": 1287.666748046875, "completions/mean_terminated_length": 1287.666748046875, "completions/min_length": 639.0, "completions/min_terminated_length": 639.0, "epoch": 0.2330393487109905, "frac_reward_zero_std": 0.5, "grad_norm": 0.09936317056417465, "kl": 0.0, "learning_rate": 3.902691511387163e-07, "loss": 0.0, "num_tokens": 23736716.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 687 }, { "completion_length": 1156.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 4769.0, "completions/max_terminated_length": 4769.0, "completions/mean_length": 1156.25, "completions/mean_terminated_length": 1156.25, "completions/min_length": 576.0, "completions/min_terminated_length": 576.0, "epoch": 0.23337856173677068, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.900966183574879e-07, "loss": 0.0, "num_tokens": 23758805.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 688 }, { "completion_length": 1892.5000610351562, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6275.0, "completions/mean_length": 2441.58349609375, "completions/mean_terminated_length": 2064.54541015625, "completions/min_length": 513.0, "completions/min_terminated_length": 513.0, "epoch": 0.23371777476255087, "frac_reward_zero_std": 0.0, "grad_norm": 0.6890511512756348, "kl": NaN, "learning_rate": 3.8992408557625947e-07, "loss": -0.0588, "num_tokens": 23796545.0, "reward": 1.0375001430511475, "reward_std": 0.2679903209209442, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.36306774616241455, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 689 }, { "completion_length": 2770.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4142.0, "completions/max_terminated_length": 4142.0, "completions/mean_length": 2770.33349609375, "completions/mean_terminated_length": 2770.33349609375, "completions/min_length": 1178.0, "completions/min_terminated_length": 1178.0, "epoch": 0.23405698778833106, "frac_reward_zero_std": 0.0, "grad_norm": 0.734200656414032, "kl": 0.0, "learning_rate": 3.89751552795031e-07, "loss": -0.0003, "num_tokens": 23845119.0, "reward": 1.0166666507720947, "reward_std": 0.440767377614975, "rewards/correctness_reward_func/mean": 0.7166666984558105, "rewards/correctness_reward_func/std": 0.4386618733406067, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 690 }, { "completion_length": 1363.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1962.0, "completions/max_terminated_length": 1962.0, "completions/mean_length": 1363.5833740234375, "completions/mean_terminated_length": 1363.5833740234375, "completions/min_length": 693.0, "completions/min_terminated_length": 693.0, "epoch": 0.23439620081411125, "frac_reward_zero_std": 1.0, "grad_norm": 2.2079890982240613e-07, "kl": 0.0, "learning_rate": 3.8957902001380263e-07, "loss": 0.0, "num_tokens": 23873902.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 691 }, { "completion_length": 822.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1576.0, "completions/max_terminated_length": 1576.0, "completions/mean_length": 822.8333740234375, "completions/mean_terminated_length": 822.8333740234375, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 0.23473541383989144, "frac_reward_zero_std": 0.0, "grad_norm": 0.10281941294670105, "kl": 0.0, "learning_rate": 3.894064872325742e-07, "loss": -0.0002, "num_tokens": 23895980.0, "reward": 1.1500000953674316, "reward_std": 0.09246459603309631, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 692 }, { "completion_length": 699.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1049.0, "completions/max_terminated_length": 1049.0, "completions/mean_length": 699.5833740234375, "completions/mean_terminated_length": 699.5833740234375, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.23507462686567165, "frac_reward_zero_std": 0.5, "grad_norm": 0.28774434328079224, "kl": 0.0, "learning_rate": 3.8923395445134574e-07, "loss": -0.0007, "num_tokens": 23916567.0, "reward": 0.8833333849906921, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.5833333134651184, "rewards/correctness_reward_func/std": 0.5149286389350891, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 693 }, { "completion_length": 1013.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2550.0, "completions/max_terminated_length": 2550.0, "completions/mean_length": 1013.0, "completions/mean_terminated_length": 1013.0, "completions/min_length": 566.0, "completions/min_terminated_length": 566.0, "epoch": 0.23541383989145184, "frac_reward_zero_std": 0.5, "grad_norm": 0.07994963973760605, "kl": 0.0, "learning_rate": 3.890614216701173e-07, "loss": 0.0024, "num_tokens": 23944047.0, "reward": 1.2666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 694 }, { "completion_length": 1904.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3562.0, "completions/max_terminated_length": 3562.0, "completions/mean_length": 1904.916748046875, "completions/mean_terminated_length": 1904.916748046875, "completions/min_length": 606.0, "completions/min_terminated_length": 606.0, "epoch": 0.23575305291723203, "frac_reward_zero_std": 0.5, "grad_norm": 0.09583219140768051, "kl": 0.0, "learning_rate": 3.888888888888889e-07, "loss": -0.0006, "num_tokens": 23977784.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 695 }, { "completion_length": 876.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1769.0, "completions/max_terminated_length": 1769.0, "completions/mean_length": 876.8333740234375, "completions/mean_terminated_length": 876.8333740234375, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.23609226594301222, "frac_reward_zero_std": 1.0, "grad_norm": 7.380832300896145e-08, "kl": 0.0, "learning_rate": 3.8871635610766045e-07, "loss": 0.0, "num_tokens": 24001866.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 696 }, { "completion_length": 1457.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3931.0, "completions/max_terminated_length": 3931.0, "completions/mean_length": 1457.5, "completions/mean_terminated_length": 1457.5, "completions/min_length": 695.0, "completions/min_terminated_length": 695.0, "epoch": 0.2364314789687924, "frac_reward_zero_std": 0.5, "grad_norm": 0.5751703381538391, "kl": 0.0, "learning_rate": 3.88543823326432e-07, "loss": -0.022, "num_tokens": 24030342.0, "reward": 0.9000000953674316, "reward_std": 0.21908903121948242, "rewards/correctness_reward_func/mean": 0.6000000238418579, "rewards/correctness_reward_func/std": 0.36181363463401794, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 697 }, { "completion_length": 1276.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 1866.0, "completions/max_terminated_length": 1866.0, "completions/mean_length": 1276.666748046875, "completions/mean_terminated_length": 1276.666748046875, "completions/min_length": 876.0, "completions/min_terminated_length": 876.0, "epoch": 0.2367706919945726, "frac_reward_zero_std": 0.0, "grad_norm": 0.5665131211280823, "kl": 0.0, "learning_rate": 3.8837129054520355e-07, "loss": -0.0076, "num_tokens": 24057146.0, "reward": 1.066666841506958, "reward_std": 0.3098386526107788, "rewards/correctness_reward_func/mean": 0.7666666507720947, "rewards/correctness_reward_func/std": 0.3700941801071167, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 698 }, { "completion_length": 1514.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3280.0, "completions/max_terminated_length": 3280.0, "completions/mean_length": 1514.3333740234375, "completions/mean_terminated_length": 1514.3333740234375, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.23710990502035278, "frac_reward_zero_std": 0.0, "grad_norm": 0.4253982901573181, "kl": 0.0, "learning_rate": 3.8819875776397516e-07, "loss": 0.0088, "num_tokens": 24085560.0, "reward": 0.7000000476837158, "reward_std": 0.3265986442565918, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 699 }, { "completion_length": 1934.5833740234375, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5187.0, "completions/mean_length": 3032.75, "completions/mean_terminated_length": 2321.5, "completions/min_length": 951.0, "completions/min_terminated_length": 951.0, "epoch": 0.23744911804613297, "frac_reward_zero_std": 0.5, "grad_norm": 0.4350591003894806, "kl": NaN, "learning_rate": 3.880262249827467e-07, "loss": -0.0673, "num_tokens": 24118171.0, "reward": 0.9166667461395264, "reward_std": 0.2840188145637512, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 700 }, { "completion_length": 1958.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4626.0, "completions/max_terminated_length": 4626.0, "completions/mean_length": 1958.0, "completions/mean_terminated_length": 1958.0, "completions/min_length": 797.0, "completions/min_terminated_length": 797.0, "epoch": 0.23778833107191316, "frac_reward_zero_std": 0.0, "grad_norm": 0.35070163011550903, "kl": 0.0, "learning_rate": 3.8785369220151826e-07, "loss": 0.0025, "num_tokens": 24153667.0, "reward": 1.1000001430511475, "reward_std": 0.23490385711193085, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.2696799635887146, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 701 }, { "completion_length": 1124.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2765.0, "completions/max_terminated_length": 2765.0, "completions/mean_length": 1124.5, "completions/mean_terminated_length": 1124.5, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 0.23812754409769335, "frac_reward_zero_std": 0.5, "grad_norm": 0.11582597345113754, "kl": 0.0, "learning_rate": 3.876811594202898e-07, "loss": -0.0012, "num_tokens": 24179749.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 702 }, { "completion_length": 1943.25, "completions/clipped_ratio": 0.0, "completions/max_length": 4035.0, "completions/max_terminated_length": 4035.0, "completions/mean_length": 1943.25, "completions/mean_terminated_length": 1943.25, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "epoch": 0.23846675712347354, "frac_reward_zero_std": 0.5, "grad_norm": 0.11352552473545074, "kl": 0.0, "learning_rate": 3.875086266390614e-07, "loss": 0.0012, "num_tokens": 24213964.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 703 }, { "completion_length": 1198.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2858.0, "completions/max_terminated_length": 2858.0, "completions/mean_length": 1198.25, "completions/mean_terminated_length": 1198.25, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 0.23880597014925373, "frac_reward_zero_std": 0.5, "grad_norm": 0.07122494280338287, "kl": 0.0, "learning_rate": 3.873360938578329e-07, "loss": 0.001, "num_tokens": 24244057.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 704 }, { "completion_length": 2193.666748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 3976.0, "completions/mean_length": 2742.75, "completions/mean_terminated_length": 2393.091064453125, "completions/min_length": 843.0, "completions/min_terminated_length": 843.0, "epoch": 0.23914518317503392, "frac_reward_zero_std": 0.5, "grad_norm": 0.09500417858362198, "kl": NaN, "learning_rate": 3.8716356107660453e-07, "loss": -0.0105, "num_tokens": 24282195.0, "reward": 0.6750000715255737, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 705 }, { "completion_length": 2630.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5251.0, "completions/max_terminated_length": 5251.0, "completions/mean_length": 2630.58349609375, "completions/mean_terminated_length": 2630.58349609375, "completions/min_length": 829.0, "completions/min_terminated_length": 829.0, "epoch": 0.2394843962008141, "frac_reward_zero_std": 0.0, "grad_norm": 0.9753767848014832, "kl": 0.0, "learning_rate": 3.8699102829537613e-07, "loss": -0.0168, "num_tokens": 24327838.0, "reward": 1.1000001430511475, "reward_std": 0.3098386526107788, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.38138505816459656, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 706 }, { "completion_length": 1942.7500610351562, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4530.0, "completions/mean_length": 2491.83349609375, "completions/mean_terminated_length": 2119.36376953125, "completions/min_length": 513.0, "completions/min_terminated_length": 513.0, "epoch": 0.2398236092265943, "frac_reward_zero_std": 0.0, "grad_norm": 0.2561108469963074, "kl": NaN, "learning_rate": 3.868184955141477e-07, "loss": -0.0457, "num_tokens": 24366193.0, "reward": 1.0416667461395264, "reward_std": 0.2761763334274292, "rewards/correctness_reward_func/mean": 0.7666667103767395, "rewards/correctness_reward_func/std": 0.2534608840942383, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 707 }, { "completion_length": 1299.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2080.0, "completions/max_terminated_length": 2080.0, "completions/mean_length": 1299.0, "completions/mean_terminated_length": 1299.0, "completions/min_length": 559.0, "completions/min_terminated_length": 559.0, "epoch": 0.24016282225237448, "frac_reward_zero_std": 0.5, "grad_norm": 0.11090793460607529, "kl": 0.0, "learning_rate": 3.8664596273291924e-07, "loss": -0.0001, "num_tokens": 24393121.0, "reward": 0.7666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 708 }, { "completion_length": 2367.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 5939.0, "completions/max_terminated_length": 5939.0, "completions/mean_length": 2367.75, "completions/mean_terminated_length": 2367.75, "completions/min_length": 1082.0, "completions/min_terminated_length": 1082.0, "epoch": 0.24050203527815467, "frac_reward_zero_std": 1.0, "grad_norm": 3.2568166830060363e-07, "kl": 0.0, "learning_rate": 3.864734299516908e-07, "loss": 0.0, "num_tokens": 24432862.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 709 }, { "completion_length": 1470.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2653.0, "completions/max_terminated_length": 2653.0, "completions/mean_length": 1470.666748046875, "completions/mean_terminated_length": 1470.666748046875, "completions/min_length": 611.0, "completions/min_terminated_length": 611.0, "epoch": 0.24084124830393486, "frac_reward_zero_std": 1.0, "grad_norm": 2.8426899234546e-07, "kl": 0.0, "learning_rate": 3.863008971704624e-07, "loss": 0.0, "num_tokens": 24462162.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 710 }, { "completion_length": 2114.2501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 6433.0, "completions/max_terminated_length": 6433.0, "completions/mean_length": 2114.25, "completions/mean_terminated_length": 2114.25, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "epoch": 0.24118046132971507, "frac_reward_zero_std": 0.5, "grad_norm": 0.05685143172740936, "kl": 0.0, "learning_rate": 3.8612836438923395e-07, "loss": 0.0, "num_tokens": 24495963.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 711 }, { "completion_length": 929.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1821.0, "completions/max_terminated_length": 1821.0, "completions/mean_length": 929.1666870117188, "completions/mean_terminated_length": 929.1666870117188, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 0.24151967435549526, "frac_reward_zero_std": 0.5, "grad_norm": 0.06048990786075592, "kl": 0.0, "learning_rate": 3.859558316080055e-07, "loss": -0.0, "num_tokens": 24517901.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 712 }, { "completion_length": 1756.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3761.0, "completions/max_terminated_length": 3761.0, "completions/mean_length": 1756.3333740234375, "completions/mean_terminated_length": 1756.3333740234375, "completions/min_length": 840.0, "completions/min_terminated_length": 840.0, "epoch": 0.24185888738127545, "frac_reward_zero_std": 0.5, "grad_norm": 0.14410604536533356, "kl": 0.0, "learning_rate": 3.8578329882677706e-07, "loss": 0.0002, "num_tokens": 24553629.0, "reward": 1.1541666984558105, "reward_std": 0.07486096024513245, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 713 }, { "completion_length": 2858.5001220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5566.0, "completions/mean_length": 3407.58349609375, "completions/mean_terminated_length": 3118.36376953125, "completions/min_length": 1465.0, "completions/min_terminated_length": 1465.0, "epoch": 0.24219810040705564, "frac_reward_zero_std": 0.0, "grad_norm": 0.2787613272666931, "kl": NaN, "learning_rate": 3.8561076604554866e-07, "loss": -0.0455, "num_tokens": 24603819.0, "reward": 1.0916666984558105, "reward_std": 0.26536136865615845, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 714 }, { "completion_length": 2805.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6465.0, "completions/max_terminated_length": 6465.0, "completions/mean_length": 2805.5, "completions/mean_terminated_length": 2805.5, "completions/min_length": 967.0, "completions/min_terminated_length": 967.0, "epoch": 0.24253731343283583, "frac_reward_zero_std": 0.5, "grad_norm": 0.48999252915382385, "kl": 0.0, "learning_rate": 3.8543823326432016e-07, "loss": 0.0095, "num_tokens": 24648591.0, "reward": 0.38333332538604736, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.0833333358168602, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 715 }, { "completion_length": 2720.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 5956.0, "completions/max_terminated_length": 5956.0, "completions/mean_length": 2720.166748046875, "completions/mean_terminated_length": 2720.166748046875, "completions/min_length": 774.0, "completions/min_terminated_length": 774.0, "epoch": 0.24287652645861602, "frac_reward_zero_std": 0.5, "grad_norm": 0.42055872082710266, "kl": 0.0, "learning_rate": 3.8526570048309177e-07, "loss": -0.0159, "num_tokens": 24692747.0, "reward": 0.6333333253860474, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 716 }, { "completion_length": 1547.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2570.0, "completions/max_terminated_length": 2570.0, "completions/mean_length": 1547.416748046875, "completions/mean_terminated_length": 1547.416748046875, "completions/min_length": 846.0, "completions/min_terminated_length": 846.0, "epoch": 0.2432157394843962, "frac_reward_zero_std": 0.5, "grad_norm": 0.07790729403495789, "kl": 0.0, "learning_rate": 3.850931677018633e-07, "loss": -0.0014, "num_tokens": 24721240.0, "reward": 1.1666667461395264, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 717 }, { "completion_length": 1684.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4692.0, "completions/max_terminated_length": 4692.0, "completions/mean_length": 1684.5, "completions/mean_terminated_length": 1684.5, "completions/min_length": 559.0, "completions/min_terminated_length": 559.0, "epoch": 0.2435549525101764, "frac_reward_zero_std": 0.5, "grad_norm": 0.09478975832462311, "kl": 0.0, "learning_rate": 3.8492063492063493e-07, "loss": -0.0023, "num_tokens": 24754684.0, "reward": 1.133333444595337, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 718 }, { "completion_length": 3755.0001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 6407.0, "completions/max_terminated_length": 6407.0, "completions/mean_length": 3755.0, "completions/mean_terminated_length": 3755.0, "completions/min_length": 2256.0, "completions/min_terminated_length": 2256.0, "epoch": 0.24389416553595658, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.8474810213940643e-07, "loss": 0.0, "num_tokens": 24810616.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 719 }, { "completion_length": 1573.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2848.0, "completions/max_terminated_length": 2848.0, "completions/mean_length": 1573.3333740234375, "completions/mean_terminated_length": 1573.3333740234375, "completions/min_length": 851.0, "completions/min_terminated_length": 851.0, "epoch": 0.24423337856173677, "frac_reward_zero_std": 0.5, "grad_norm": 0.08259549736976624, "kl": 0.0, "learning_rate": 3.8457556935817803e-07, "loss": -0.0009, "num_tokens": 24840470.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 720 }, { "completion_length": 1527.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4029.0, "completions/max_terminated_length": 4029.0, "completions/mean_length": 1527.416748046875, "completions/mean_terminated_length": 1527.416748046875, "completions/min_length": 735.0, "completions/min_terminated_length": 735.0, "epoch": 0.24457259158751696, "frac_reward_zero_std": 0.5, "grad_norm": 0.507327139377594, "kl": 0.0, "learning_rate": 3.8440303657694964e-07, "loss": -0.022, "num_tokens": 24872401.0, "reward": 1.0, "reward_std": 0.21908903121948242, "rewards/correctness_reward_func/mean": 0.699999988079071, "rewards/correctness_reward_func/std": 0.43064433336257935, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 721 }, { "completion_length": 2963.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4433.0, "completions/max_terminated_length": 4433.0, "completions/mean_length": 2963.916748046875, "completions/mean_terminated_length": 2963.916748046875, "completions/min_length": 1830.0, "completions/min_terminated_length": 1830.0, "epoch": 0.24491180461329715, "frac_reward_zero_std": 0.0, "grad_norm": 0.5084398984909058, "kl": 0.0, "learning_rate": 3.842305037957212e-07, "loss": 0.0097, "num_tokens": 24920676.0, "reward": 1.149999976158142, "reward_std": 0.2270146608352661, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.28444522619247437, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 722 }, { "completion_length": 2600.0833740234375, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 5634.0, "completions/mean_length": 4247.33349609375, "completions/mean_terminated_length": 3466.77783203125, "completions/min_length": 919.0, "completions/min_terminated_length": 919.0, "epoch": 0.24525101763907733, "frac_reward_zero_std": 0.0, "grad_norm": 1.0371900796890259, "kl": NaN, "learning_rate": 3.8405797101449274e-07, "loss": -0.0943, "num_tokens": 24960865.0, "reward": 0.7083333730697632, "reward_std": 0.4670211672782898, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 723 }, { "completion_length": 526.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1250.0, "completions/max_terminated_length": 1250.0, "completions/mean_length": 526.3333740234375, "completions/mean_terminated_length": 526.3333740234375, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.24559023066485752, "frac_reward_zero_std": 1.0, "grad_norm": 7.004301494362153e-08, "kl": 0.0, "learning_rate": 3.838854382332643e-07, "loss": 0.0, "num_tokens": 24981371.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 724 }, { "completion_length": 1995.166748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4928.0, "completions/mean_length": 2544.25, "completions/mean_terminated_length": 2176.54541015625, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.2459294436906377, "frac_reward_zero_std": 0.5, "grad_norm": 0.07325784862041473, "kl": NaN, "learning_rate": 3.837129054520359e-07, "loss": -0.0124, "num_tokens": 25017247.0, "reward": 0.6750000715255737, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 725 }, { "completion_length": 1874.3334350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 4024.0, "completions/max_terminated_length": 4024.0, "completions/mean_length": 1874.3333740234375, "completions/mean_terminated_length": 1874.3333740234375, "completions/min_length": 585.0, "completions/min_terminated_length": 585.0, "epoch": 0.2462686567164179, "frac_reward_zero_std": 1.0, "grad_norm": 1.9815544760604098e-07, "kl": 0.0, "learning_rate": 3.835403726708074e-07, "loss": 0.0, "num_tokens": 25055375.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 726 }, { "completion_length": 1643.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5250.0, "completions/max_terminated_length": 5250.0, "completions/mean_length": 1643.5833740234375, "completions/mean_terminated_length": 1643.5833740234375, "completions/min_length": 555.0, "completions/min_terminated_length": 555.0, "epoch": 0.2466078697421981, "frac_reward_zero_std": 1.0, "grad_norm": 2.3413605276800808e-07, "kl": 0.0, "learning_rate": 3.83367839889579e-07, "loss": 0.0, "num_tokens": 25086828.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 727 }, { "completion_length": 2202.2500610351562, "completions/clipped_ratio": 0.5833333333333333, "completions/max_length": 6589.0, "completions/max_terminated_length": 6381.0, "completions/mean_length": 6045.83349609375, "completions/mean_terminated_length": 5285.39990234375, "completions/min_length": 3332.0, "completions/min_terminated_length": 3332.0, "epoch": 0.24694708276797828, "frac_reward_zero_std": 0.0, "grad_norm": 0.7284405827522278, "kl": NaN, "learning_rate": 3.8319530710835056e-07, "loss": -0.1411, "num_tokens": 25121163.0, "reward": 0.33750003576278687, "reward_std": 0.37498682737350464, "rewards/correctness_reward_func/mean": 0.20000000298023224, "rewards/correctness_reward_func/std": 0.36181363463401794, "rewards/format_reward_func/mean": 0.13750000298023224, "rewards/format_reward_func/std": 0.14943073689937592, "step": 728 }, { "completion_length": 1111.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1887.0, "completions/max_terminated_length": 1887.0, "completions/mean_length": 1111.3333740234375, "completions/mean_terminated_length": 1111.3333740234375, "completions/min_length": 764.0, "completions/min_terminated_length": 764.0, "epoch": 0.2472862957937585, "frac_reward_zero_std": 1.0, "grad_norm": 1.3690761591078626e-07, "kl": 0.0, "learning_rate": 3.8302277432712217e-07, "loss": 0.0, "num_tokens": 25146583.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 729 }, { "completion_length": 3647.250244140625, "completions/clipped_ratio": 0.0, "completions/max_length": 5731.0, "completions/max_terminated_length": 5731.0, "completions/mean_length": 3647.25, "completions/mean_terminated_length": 3647.25, "completions/min_length": 1535.0, "completions/min_terminated_length": 1535.0, "epoch": 0.24762550881953868, "frac_reward_zero_std": 0.0, "grad_norm": 0.6610546708106995, "kl": 0.0, "learning_rate": 3.8285024154589367e-07, "loss": -0.0118, "num_tokens": 25205158.0, "reward": 0.7833334803581238, "reward_std": 0.20412415266036987, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.43029239773750305, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 730 }, { "completion_length": 2499.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6543.0, "completions/max_terminated_length": 6543.0, "completions/mean_length": 2499.666748046875, "completions/mean_terminated_length": 2499.666748046875, "completions/min_length": 608.0, "completions/min_terminated_length": 608.0, "epoch": 0.24796472184531887, "frac_reward_zero_std": 0.5, "grad_norm": 0.07455330342054367, "kl": 0.0, "learning_rate": 3.8267770876466527e-07, "loss": -0.0011, "num_tokens": 25245858.0, "reward": 0.7666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 731 }, { "completion_length": 1155.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2734.0, "completions/max_terminated_length": 2734.0, "completions/mean_length": 1155.25, "completions/mean_terminated_length": 1155.25, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "epoch": 0.24830393487109906, "frac_reward_zero_std": 0.5, "grad_norm": 0.08911774307489395, "kl": 0.0, "learning_rate": 3.825051759834368e-07, "loss": 0.0029, "num_tokens": 25270719.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 732 }, { "completion_length": 1258.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1898.0, "completions/max_terminated_length": 1898.0, "completions/mean_length": 1258.8333740234375, "completions/mean_terminated_length": 1258.8333740234375, "completions/min_length": 658.0, "completions/min_terminated_length": 658.0, "epoch": 0.24864314789687925, "frac_reward_zero_std": 0.5, "grad_norm": 0.05099448561668396, "kl": 0.0, "learning_rate": 3.8233264320220843e-07, "loss": -0.0012, "num_tokens": 25292191.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 733 }, { "completion_length": 2122.7501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 2994.0, "completions/max_terminated_length": 2994.0, "completions/mean_length": 2122.75, "completions/mean_terminated_length": 2122.75, "completions/min_length": 971.0, "completions/min_terminated_length": 971.0, "epoch": 0.24898236092265943, "frac_reward_zero_std": 0.5, "grad_norm": 0.335657000541687, "kl": 0.0, "learning_rate": 3.8216011042097993e-07, "loss": 0.0078, "num_tokens": 25329844.0, "reward": 0.6333333253860474, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 734 }, { "completion_length": 2080.7501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 5717.0, "completions/max_terminated_length": 5717.0, "completions/mean_length": 2080.75, "completions/mean_terminated_length": 2080.75, "completions/min_length": 529.0, "completions/min_terminated_length": 529.0, "epoch": 0.24932157394843962, "frac_reward_zero_std": 1.0, "grad_norm": 3.0757533409087046e-07, "kl": 0.0, "learning_rate": 3.8198757763975154e-07, "loss": 0.0, "num_tokens": 25368103.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 735 }, { "completion_length": 2086.5001220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5375.0, "completions/mean_length": 2635.58349609375, "completions/mean_terminated_length": 2276.181884765625, "completions/min_length": 965.0, "completions/min_terminated_length": 965.0, "epoch": 0.2496607869742198, "frac_reward_zero_std": 0.0, "grad_norm": 0.5441375374794006, "kl": NaN, "learning_rate": 3.818150448585231e-07, "loss": 0.0135, "num_tokens": 25404553.0, "reward": 0.5416666865348816, "reward_std": 0.2677963674068451, "rewards/correctness_reward_func/mean": 0.2666666805744171, "rewards/correctness_reward_func/std": 0.393892765045166, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 736 }, { "completion_length": 1710.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3985.0, "completions/max_terminated_length": 3985.0, "completions/mean_length": 1710.916748046875, "completions/mean_terminated_length": 1710.916748046875, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "epoch": 0.25, "frac_reward_zero_std": 0.5, "grad_norm": 0.43407583236694336, "kl": 0.0, "learning_rate": 3.8164251207729464e-07, "loss": 0.0034, "num_tokens": 25435602.0, "reward": 0.6333333253860474, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 737 }, { "completion_length": 1790.5001220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4435.0, "completions/mean_length": 2339.58349609375, "completions/mean_terminated_length": 1953.2728271484375, "completions/min_length": 725.0, "completions/min_terminated_length": 725.0, "epoch": 0.2503392130257802, "frac_reward_zero_std": 0.0, "grad_norm": 0.07025188952684402, "kl": NaN, "learning_rate": 3.8146997929606625e-07, "loss": -0.0104, "num_tokens": 25472916.0, "reward": 0.26250001788139343, "reward_std": 0.09185586869716644, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 738 }, { "completion_length": 1810.4166870117188, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5111.0, "completions/mean_length": 2908.58349609375, "completions/mean_terminated_length": 2172.5, "completions/min_length": 804.0, "completions/min_terminated_length": 804.0, "epoch": 0.2506784260515604, "frac_reward_zero_std": 0.5, "grad_norm": 0.8079109191894531, "kl": NaN, "learning_rate": 3.812974465148378e-07, "loss": -0.0606, "num_tokens": 25503497.0, "reward": 0.5166666507720947, "reward_std": 0.2840188145637512, "rewards/correctness_reward_func/mean": 0.2666666805744171, "rewards/correctness_reward_func/std": 0.393892765045166, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 739 }, { "completion_length": 861.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1548.0, "completions/max_terminated_length": 1548.0, "completions/mean_length": 861.6666870117188, "completions/mean_terminated_length": 861.6666870117188, "completions/min_length": 534.0, "completions/min_terminated_length": 534.0, "epoch": 0.2510176390773406, "frac_reward_zero_std": 1.0, "grad_norm": 2.0568649006236228e-07, "kl": 0.0, "learning_rate": 3.811249137336094e-07, "loss": 0.0, "num_tokens": 25525681.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 740 }, { "completion_length": 2058.916748046875, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6243.0, "completions/mean_length": 4255.25, "completions/mean_terminated_length": 3088.375, "completions/min_length": 1242.0, "completions/min_terminated_length": 1242.0, "epoch": 0.25135685210312075, "frac_reward_zero_std": 0.0, "grad_norm": 1.2331775426864624, "kl": NaN, "learning_rate": 3.809523809523809e-07, "loss": -0.1065, "num_tokens": 25565538.0, "reward": 0.6666666865348816, "reward_std": 0.4954916834831238, "rewards/correctness_reward_func/mean": 0.46666669845581055, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.14770980179309845, "step": 741 }, { "completion_length": 1583.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2655.0, "completions/max_terminated_length": 2655.0, "completions/mean_length": 1583.0, "completions/mean_terminated_length": 1583.0, "completions/min_length": 746.0, "completions/min_terminated_length": 746.0, "epoch": 0.25169606512890097, "frac_reward_zero_std": 0.5, "grad_norm": 0.3111180067062378, "kl": 0.0, "learning_rate": 3.807798481711525e-07, "loss": -0.0014, "num_tokens": 25600728.0, "reward": 1.133333444595337, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.2806917726993561, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 742 }, { "completion_length": 2449.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4081.0, "completions/max_terminated_length": 4081.0, "completions/mean_length": 2449.83349609375, "completions/mean_terminated_length": 2449.83349609375, "completions/min_length": 670.0, "completions/min_terminated_length": 670.0, "epoch": 0.25203527815468113, "frac_reward_zero_std": 1.0, "grad_norm": 2.9365247655732674e-07, "kl": 0.0, "learning_rate": 3.8060731538992407e-07, "loss": 0.0, "num_tokens": 25642138.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 743 }, { "completion_length": 1862.6666870117188, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5477.0, "completions/mean_length": 2411.75, "completions/mean_terminated_length": 2032.0, "completions/min_length": 1016.0, "completions/min_terminated_length": 1016.0, "epoch": 0.25237449118046135, "frac_reward_zero_std": 0.5, "grad_norm": 0.5083035826683044, "kl": NaN, "learning_rate": 3.8043478260869567e-07, "loss": -0.0015, "num_tokens": 25674756.0, "reward": 0.9416667819023132, "reward_std": 0.24983328580856323, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 744 }, { "completion_length": 2217.5001220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5314.0, "completions/mean_length": 2766.58349609375, "completions/mean_terminated_length": 2419.091064453125, "completions/min_length": 660.0, "completions/min_terminated_length": 660.0, "epoch": 0.2527137042062415, "frac_reward_zero_std": 0.5, "grad_norm": 0.5263506174087524, "kl": NaN, "learning_rate": 3.8026224982746717e-07, "loss": -0.0127, "num_tokens": 25710984.0, "reward": 0.8875001668930054, "reward_std": 0.23438750207424164, "rewards/correctness_reward_func/mean": 0.5999999642372131, "rewards/correctness_reward_func/std": 0.36181363463401794, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 745 }, { "completion_length": 2103.3334350585938, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4934.0, "completions/mean_length": 2652.416748046875, "completions/mean_terminated_length": 2294.54541015625, "completions/min_length": 779.0, "completions/min_terminated_length": 779.0, "epoch": 0.2530529172320217, "frac_reward_zero_std": 0.5, "grad_norm": 0.532516360282898, "kl": NaN, "learning_rate": 3.800897170462388e-07, "loss": -0.0576, "num_tokens": 25749766.0, "reward": 0.8083333969116211, "reward_std": 0.23327383399009705, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 746 }, { "completion_length": 2240.8333740234375, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6402.0, "completions/mean_length": 4437.1669921875, "completions/mean_terminated_length": 3361.25, "completions/min_length": 1404.0, "completions/min_terminated_length": 1404.0, "epoch": 0.2533921302578019, "frac_reward_zero_std": 0.0, "grad_norm": 0.7276754379272461, "kl": NaN, "learning_rate": 3.7991718426501033e-07, "loss": -0.0838, "num_tokens": 25790936.0, "reward": 0.6666666865348816, "reward_std": 0.4954916834831238, "rewards/correctness_reward_func/mean": 0.46666669845581055, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.14770980179309845, "step": 747 }, { "completion_length": 935.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1434.0, "completions/max_terminated_length": 1434.0, "completions/mean_length": 935.75, "completions/mean_terminated_length": 935.75, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "epoch": 0.2537313432835821, "frac_reward_zero_std": 1.0, "grad_norm": 9.218993568538281e-08, "kl": 0.0, "learning_rate": 3.797446514837819e-07, "loss": 0.0, "num_tokens": 25813547.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 748 }, { "completion_length": 1297.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3254.0, "completions/max_terminated_length": 3254.0, "completions/mean_length": 1297.8333740234375, "completions/mean_terminated_length": 1297.8333740234375, "completions/min_length": 514.0, "completions/min_terminated_length": 514.0, "epoch": 0.25407055630936226, "frac_reward_zero_std": 0.5, "grad_norm": 0.04760192334651947, "kl": 0.0, "learning_rate": 3.7957211870255344e-07, "loss": 0.0017, "num_tokens": 25845291.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 749 }, { "completion_length": 3342.08349609375, "completions/clipped_ratio": 0.0, "completions/max_length": 5745.0, "completions/max_terminated_length": 5745.0, "completions/mean_length": 3342.08349609375, "completions/mean_terminated_length": 3342.08349609375, "completions/min_length": 986.0, "completions/min_terminated_length": 986.0, "epoch": 0.2544097693351425, "frac_reward_zero_std": 0.5, "grad_norm": 0.1283939778804779, "kl": 0.0, "learning_rate": 3.7939958592132504e-07, "loss": 0.0042, "num_tokens": 25900642.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 750 }, { "completion_length": 921.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1709.0, "completions/max_terminated_length": 1709.0, "completions/mean_length": 921.5, "completions/mean_terminated_length": 921.5, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.25474898236092264, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.792270531400966e-07, "loss": 0.0, "num_tokens": 25921246.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 751 }, { "completion_length": 2476.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3929.0, "completions/max_terminated_length": 3929.0, "completions/mean_length": 2476.5, "completions/mean_terminated_length": 2476.5, "completions/min_length": 1234.0, "completions/min_terminated_length": 1234.0, "epoch": 0.25508819538670285, "frac_reward_zero_std": 0.0, "grad_norm": 0.14889118075370789, "kl": 0.0, "learning_rate": 3.7905452035886815e-07, "loss": -0.0032, "num_tokens": 25963354.0, "reward": 1.2333333492279053, "reward_std": 0.10327950119972229, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 752 }, { "completion_length": 1880.3333740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4136.0, "completions/mean_length": 2429.416748046875, "completions/mean_terminated_length": 2051.272705078125, "completions/min_length": 939.0, "completions/min_terminated_length": 939.0, "epoch": 0.255427408412483, "frac_reward_zero_std": 0.0, "grad_norm": 0.7829989194869995, "kl": NaN, "learning_rate": 3.7888198757763975e-07, "loss": -0.0045, "num_tokens": 26000126.0, "reward": 0.9750000834465027, "reward_std": 0.36095842719078064, "rewards/correctness_reward_func/mean": 0.7000000476837158, "rewards/correctness_reward_func/std": 0.43064433336257935, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 753 }, { "completion_length": 1247.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4548.0, "completions/max_terminated_length": 4548.0, "completions/mean_length": 1247.416748046875, "completions/mean_terminated_length": 1247.416748046875, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "epoch": 0.25576662143826323, "frac_reward_zero_std": 1.0, "grad_norm": 2.129471994294363e-07, "kl": 0.0, "learning_rate": 3.787094547964113e-07, "loss": 0.0, "num_tokens": 26026249.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 754 }, { "completion_length": 1251.9166870117188, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 4579.0, "completions/mean_length": 2350.08349609375, "completions/mean_terminated_length": 1502.300048828125, "completions/min_length": 654.0, "completions/min_terminated_length": 654.0, "epoch": 0.25610583446404345, "frac_reward_zero_std": 0.5, "grad_norm": 0.3859163224697113, "kl": NaN, "learning_rate": 3.785369220151829e-07, "loss": -0.0551, "num_tokens": 26054394.0, "reward": 1.0833332538604736, "reward_std": 0.3356585204601288, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.3892494738101959, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 755 }, { "completion_length": 1971.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 5072.0, "completions/max_terminated_length": 5072.0, "completions/mean_length": 1971.916748046875, "completions/mean_terminated_length": 1971.916748046875, "completions/min_length": 738.0, "completions/min_terminated_length": 738.0, "epoch": 0.2564450474898236, "frac_reward_zero_std": 0.5, "grad_norm": 0.13238492608070374, "kl": 0.0, "learning_rate": 3.783643892339544e-07, "loss": -0.0047, "num_tokens": 26091593.0, "reward": 0.7333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 756 }, { "completion_length": 1132.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2217.0, "completions/max_terminated_length": 2217.0, "completions/mean_length": 1132.166748046875, "completions/mean_terminated_length": 1132.166748046875, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 0.2567842605156038, "frac_reward_zero_std": 0.5, "grad_norm": 0.04667339101433754, "kl": 0.0, "learning_rate": 3.78191856452726e-07, "loss": -0.0002, "num_tokens": 26114833.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 757 }, { "completion_length": 3478.0833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5985.0, "completions/mean_length": 4027.166748046875, "completions/mean_terminated_length": 3794.27294921875, "completions/min_length": 2198.0, "completions/min_terminated_length": 2198.0, "epoch": 0.257123473541384, "frac_reward_zero_std": 0.5, "grad_norm": 0.39453667402267456, "kl": NaN, "learning_rate": 3.7801932367149757e-07, "loss": -0.0182, "num_tokens": 26167190.0, "reward": 0.7416667938232422, "reward_std": 0.1855173110961914, "rewards/correctness_reward_func/mean": 0.46666669845581055, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 758 }, { "completion_length": 1258.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2983.0, "completions/max_terminated_length": 2983.0, "completions/mean_length": 1258.3333740234375, "completions/mean_terminated_length": 1258.3333740234375, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "epoch": 0.2574626865671642, "frac_reward_zero_std": 1.0, "grad_norm": 1.669351519240081e-07, "kl": 0.0, "learning_rate": 3.778467908902691e-07, "loss": 0.0, "num_tokens": 26195550.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 759 }, { "completion_length": 708.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1381.0, "completions/max_terminated_length": 1381.0, "completions/mean_length": 708.9166870117188, "completions/mean_terminated_length": 708.9166870117188, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 0.25780189959294436, "frac_reward_zero_std": 0.5, "grad_norm": 0.04684539511799812, "kl": 0.0, "learning_rate": 3.776742581090407e-07, "loss": -0.0, "num_tokens": 26213207.0, "reward": 0.7875000834465027, "reward_std": 0.03061862848699093, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 760 }, { "completion_length": 869.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 2138.0, "completions/max_terminated_length": 2138.0, "completions/mean_length": 869.0, "completions/mean_terminated_length": 869.0, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 0.2581411126187246, "frac_reward_zero_std": 0.5, "grad_norm": 0.07464686036109924, "kl": 0.0, "learning_rate": 3.775017253278123e-07, "loss": 0.0001, "num_tokens": 26235623.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 761 }, { "completion_length": 1791.666748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5793.0, "completions/mean_length": 2340.75, "completions/mean_terminated_length": 1954.5455322265625, "completions/min_length": 737.0, "completions/min_terminated_length": 737.0, "epoch": 0.25848032564450474, "frac_reward_zero_std": 0.0, "grad_norm": 0.49990057945251465, "kl": NaN, "learning_rate": 3.7732919254658383e-07, "loss": -0.0433, "num_tokens": 26269273.0, "reward": 1.0250000953674316, "reward_std": 0.2906580865383148, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.36306774616241455, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 762 }, { "completion_length": 1257.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2279.0, "completions/max_terminated_length": 2279.0, "completions/mean_length": 1257.666748046875, "completions/mean_terminated_length": 1257.666748046875, "completions/min_length": 546.0, "completions/min_terminated_length": 546.0, "epoch": 0.25881953867028495, "frac_reward_zero_std": 1.0, "grad_norm": 9.947257240128238e-08, "kl": 0.0, "learning_rate": 3.771566597653554e-07, "loss": 0.0, "num_tokens": 26291937.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 763 }, { "completion_length": 2138.5, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 4321.0, "completions/mean_length": 3785.75, "completions/mean_terminated_length": 2851.333251953125, "completions/min_length": 1372.0, "completions/min_terminated_length": 1372.0, "epoch": 0.2591587516960651, "frac_reward_zero_std": 0.0, "grad_norm": 1.0325132608413696, "kl": NaN, "learning_rate": 3.7698412698412694e-07, "loss": -0.0504, "num_tokens": 26331945.0, "reward": 0.7416666746139526, "reward_std": 0.4976527690887451, "rewards/correctness_reward_func/mean": 0.5166666507720947, "rewards/correctness_reward_func/std": 0.4628632962703705, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 764 }, { "completion_length": 847.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1307.0, "completions/max_terminated_length": 1307.0, "completions/mean_length": 847.9166870117188, "completions/mean_terminated_length": 847.9166870117188, "completions/min_length": 562.0, "completions/min_terminated_length": 562.0, "epoch": 0.25949796472184533, "frac_reward_zero_std": 1.0, "grad_norm": 1.0442206388461273e-07, "kl": 0.0, "learning_rate": 3.7681159420289855e-07, "loss": 0.0, "num_tokens": 26353748.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 765 }, { "completion_length": 719.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1199.0, "completions/max_terminated_length": 1199.0, "completions/mean_length": 719.1666870117188, "completions/mean_terminated_length": 719.1666870117188, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.2598371777476255, "frac_reward_zero_std": 0.5, "grad_norm": 0.2788970172405243, "kl": 0.0, "learning_rate": 3.766390614216701e-07, "loss": -0.0009, "num_tokens": 26376988.0, "reward": 0.6708333492279053, "reward_std": 0.22383961081504822, "rewards/correctness_reward_func/mean": 0.38333332538604736, "rewards/correctness_reward_func/std": 0.4783177673816681, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 766 }, { "completion_length": 1171.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 1726.0, "completions/max_terminated_length": 1726.0, "completions/mean_length": 1171.916748046875, "completions/mean_terminated_length": 1171.916748046875, "completions/min_length": 713.0, "completions/min_terminated_length": 713.0, "epoch": 0.2601763907734057, "frac_reward_zero_std": 1.0, "grad_norm": 2.0947449286268238e-07, "kl": 0.0, "learning_rate": 3.7646652864044165e-07, "loss": 0.0, "num_tokens": 26401281.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 767 }, { "completion_length": 2836.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5037.0, "completions/max_terminated_length": 5037.0, "completions/mean_length": 2836.166748046875, "completions/mean_terminated_length": 2836.166748046875, "completions/min_length": 1051.0, "completions/min_terminated_length": 1051.0, "epoch": 0.26051560379918587, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.7629399585921326e-07, "loss": 0.0, "num_tokens": 26448257.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 768 }, { "completion_length": 1207.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2167.0, "completions/max_terminated_length": 2167.0, "completions/mean_length": 1207.416748046875, "completions/mean_terminated_length": 1207.416748046875, "completions/min_length": 774.0, "completions/min_terminated_length": 774.0, "epoch": 0.2608548168249661, "frac_reward_zero_std": 0.5, "grad_norm": 0.06325611472129822, "kl": 0.0, "learning_rate": 3.761214630779848e-07, "loss": 0.0005, "num_tokens": 26474326.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 769 }, { "completion_length": 2223.1666870117188, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5250.0, "completions/mean_length": 2772.25, "completions/mean_terminated_length": 2425.272705078125, "completions/min_length": 816.0, "completions/min_terminated_length": 816.0, "epoch": 0.26119402985074625, "frac_reward_zero_std": 0.5, "grad_norm": 0.499055415391922, "kl": NaN, "learning_rate": 3.7594893029675636e-07, "loss": -0.0143, "num_tokens": 26511738.0, "reward": 0.9083334803581238, "reward_std": 0.2905454635620117, "rewards/correctness_reward_func/mean": 0.6333333253860474, "rewards/correctness_reward_func/std": 0.3892494738101959, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 770 }, { "completion_length": 2154.8334350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 5527.0, "completions/max_terminated_length": 5527.0, "completions/mean_length": 2154.83349609375, "completions/mean_terminated_length": 2154.83349609375, "completions/min_length": 946.0, "completions/min_terminated_length": 946.0, "epoch": 0.26153324287652646, "frac_reward_zero_std": 0.0, "grad_norm": 0.16773566603660583, "kl": 0.0, "learning_rate": 3.757763975155279e-07, "loss": 0.0018, "num_tokens": 26548642.0, "reward": 1.183333396911621, "reward_std": 0.09246458858251572, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 771 }, { "completion_length": 502.16668701171875, "completions/clipped_ratio": 0.0, "completions/max_length": 850.0, "completions/max_terminated_length": 850.0, "completions/mean_length": 502.16668701171875, "completions/mean_terminated_length": 502.16668701171875, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.2618724559023066, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.756038647342995e-07, "loss": 0.0, "num_tokens": 26566974.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 772 }, { "completion_length": 684.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1408.0, "completions/max_terminated_length": 1408.0, "completions/mean_length": 684.4166870117188, "completions/mean_terminated_length": 684.4166870117188, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.26221166892808684, "frac_reward_zero_std": 0.5, "grad_norm": 0.07578389346599579, "kl": 0.0, "learning_rate": 3.754313319530711e-07, "loss": -0.0004, "num_tokens": 26585555.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 773 }, { "completion_length": 1621.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3594.0, "completions/max_terminated_length": 3594.0, "completions/mean_length": 1621.8333740234375, "completions/mean_terminated_length": 1621.8333740234375, "completions/min_length": 887.0, "completions/min_terminated_length": 887.0, "epoch": 0.26255088195386705, "frac_reward_zero_std": 1.0, "grad_norm": 2.3733241505397018e-07, "kl": 0.0, "learning_rate": 3.7525879917184263e-07, "loss": 0.0, "num_tokens": 26615775.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 774 }, { "completion_length": 1442.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4935.0, "completions/max_terminated_length": 4935.0, "completions/mean_length": 1442.166748046875, "completions/mean_terminated_length": 1442.166748046875, "completions/min_length": 592.0, "completions/min_terminated_length": 592.0, "epoch": 0.2628900949796472, "frac_reward_zero_std": 0.5, "grad_norm": 0.1121632531285286, "kl": 0.0, "learning_rate": 3.750862663906142e-07, "loss": -0.0074, "num_tokens": 26647673.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 775 }, { "completion_length": 520.5000152587891, "completions/clipped_ratio": 0.0, "completions/max_length": 897.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 520.5, "completions/mean_terminated_length": 520.5, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.26322930800542743, "frac_reward_zero_std": 0.5, "grad_norm": 0.2791745960712433, "kl": 0.0, "learning_rate": 3.749137336093858e-07, "loss": -0.0005, "num_tokens": 26665733.0, "reward": 1.1041667461395264, "reward_std": 0.2002602219581604, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 776 }, { "completion_length": 702.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1141.0, "completions/max_terminated_length": 1141.0, "completions/mean_length": 702.0, "completions/mean_terminated_length": 702.0, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 0.2635685210312076, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.7474120082815734e-07, "loss": 0.0, "num_tokens": 26684873.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 777 }, { "completion_length": 1498.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1975.0, "completions/max_terminated_length": 1975.0, "completions/mean_length": 1498.75, "completions/mean_terminated_length": 1498.75, "completions/min_length": 1073.0, "completions/min_terminated_length": 1073.0, "epoch": 0.2639077340569878, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.745686680469289e-07, "loss": 0.0, "num_tokens": 26715134.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 778 }, { "completion_length": 1654.166748046875, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 5809.0, "completions/mean_length": 3301.416748046875, "completions/mean_terminated_length": 2205.5556640625, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "epoch": 0.26424694708276797, "frac_reward_zero_std": 0.5, "grad_norm": 0.9202248454093933, "kl": NaN, "learning_rate": 3.7439613526570044e-07, "loss": -0.0862, "num_tokens": 26745628.0, "reward": 0.8583333492279053, "reward_std": 0.27095508575439453, "rewards/correctness_reward_func/mean": 0.6333333253860474, "rewards/correctness_reward_func/std": 0.4735424220561981, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 779 }, { "completion_length": 1470.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3567.0, "completions/max_terminated_length": 3567.0, "completions/mean_length": 1470.75, "completions/mean_terminated_length": 1470.75, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.2645861601085482, "frac_reward_zero_std": 0.5, "grad_norm": 0.10494286566972733, "kl": 0.0, "learning_rate": 3.7422360248447205e-07, "loss": -0.0021, "num_tokens": 26774809.0, "reward": 1.2666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 780 }, { "completion_length": 2154.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4803.0, "completions/max_terminated_length": 4803.0, "completions/mean_length": 2154.166748046875, "completions/mean_terminated_length": 2154.166748046875, "completions/min_length": 835.0, "completions/min_terminated_length": 835.0, "epoch": 0.26492537313432835, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.740510697032436e-07, "loss": 0.0, "num_tokens": 26810025.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 781 }, { "completion_length": 1063.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1335.0, "completions/max_terminated_length": 1335.0, "completions/mean_length": 1063.25, "completions/mean_terminated_length": 1063.25, "completions/min_length": 722.0, "completions/min_terminated_length": 722.0, "epoch": 0.26526458616010856, "frac_reward_zero_std": 0.5, "grad_norm": 0.11350041627883911, "kl": 0.0, "learning_rate": 3.7387853692201516e-07, "loss": -0.0009, "num_tokens": 26833140.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 782 }, { "completion_length": 1292.5833740234375, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6559.0, "completions/mean_length": 3488.916748046875, "completions/mean_terminated_length": 1938.875, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.2656037991858887, "frac_reward_zero_std": 0.5, "grad_norm": 1.5954703092575073, "kl": NaN, "learning_rate": 3.737060041407867e-07, "loss": -0.1164, "num_tokens": 26860345.0, "reward": 0.7333334684371948, "reward_std": 0.2840188145637512, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.14770980179309845, "step": 783 }, { "completion_length": 2248.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4318.0, "completions/max_terminated_length": 4318.0, "completions/mean_length": 2248.916748046875, "completions/mean_terminated_length": 2248.916748046875, "completions/min_length": 808.0, "completions/min_terminated_length": 808.0, "epoch": 0.26594301221166894, "frac_reward_zero_std": 0.5, "grad_norm": 0.35711029171943665, "kl": 0.0, "learning_rate": 3.735334713595583e-07, "loss": -0.0026, "num_tokens": 26896800.0, "reward": 0.7666667699813843, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.46666669845581055, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 784 }, { "completion_length": 1661.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2417.0, "completions/max_terminated_length": 2417.0, "completions/mean_length": 1661.5833740234375, "completions/mean_terminated_length": 1661.5833740234375, "completions/min_length": 810.0, "completions/min_terminated_length": 810.0, "epoch": 0.2662822252374491, "frac_reward_zero_std": 0.5, "grad_norm": 0.15416787564754486, "kl": 0.0, "learning_rate": 3.7336093857832987e-07, "loss": -0.0007, "num_tokens": 26927623.0, "reward": 1.133333444595337, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 785 }, { "completion_length": 779.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1260.0, "completions/max_terminated_length": 1260.0, "completions/mean_length": 779.75, "completions/mean_terminated_length": 779.75, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "epoch": 0.2666214382632293, "frac_reward_zero_std": 0.5, "grad_norm": 0.07597201317548752, "kl": 0.0, "learning_rate": 3.731884057971014e-07, "loss": 0.0004, "num_tokens": 26947360.0, "reward": 1.133333444595337, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 786 }, { "completion_length": 2350.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4980.0, "completions/max_terminated_length": 4980.0, "completions/mean_length": 2350.83349609375, "completions/mean_terminated_length": 2350.83349609375, "completions/min_length": 1081.0, "completions/min_terminated_length": 1081.0, "epoch": 0.2669606512890095, "frac_reward_zero_std": 0.0, "grad_norm": 0.6928236484527588, "kl": 0.0, "learning_rate": 3.73015873015873e-07, "loss": 0.0143, "num_tokens": 26985722.0, "reward": 1.070833444595337, "reward_std": 0.2486901879310608, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.26227444410324097, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 787 }, { "completion_length": 1690.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4178.0, "completions/max_terminated_length": 4178.0, "completions/mean_length": 1690.166748046875, "completions/mean_terminated_length": 1690.166748046875, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 0.2672998643147897, "frac_reward_zero_std": 0.5, "grad_norm": 0.48641929030418396, "kl": 0.0, "learning_rate": 3.728433402346446e-07, "loss": 0.0025, "num_tokens": 27017704.0, "reward": 0.4333333671092987, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.13333334028720856, "rewards/correctness_reward_func/std": 0.3113996088504791, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 788 }, { "completion_length": 721.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1439.0, "completions/max_terminated_length": 1439.0, "completions/mean_length": 721.75, "completions/mean_terminated_length": 721.75, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.26763907734056985, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.7267080745341613e-07, "loss": 0.0, "num_tokens": 27044245.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 789 }, { "completion_length": 3101.166748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6048.0, "completions/mean_length": 3650.25, "completions/mean_terminated_length": 3383.091064453125, "completions/min_length": 2335.0, "completions/min_terminated_length": 2335.0, "epoch": 0.26797829036635007, "frac_reward_zero_std": 0.0, "grad_norm": 0.937369704246521, "kl": NaN, "learning_rate": 3.724982746721877e-07, "loss": -0.0396, "num_tokens": 27088401.0, "reward": 0.7041667699813843, "reward_std": 0.46168631315231323, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 790 }, { "completion_length": 2644.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 6416.0, "completions/max_terminated_length": 6416.0, "completions/mean_length": 2644.08349609375, "completions/mean_terminated_length": 2644.08349609375, "completions/min_length": 737.0, "completions/min_terminated_length": 737.0, "epoch": 0.2683175033921303, "frac_reward_zero_std": 0.5, "grad_norm": 0.6890131235122681, "kl": 0.0, "learning_rate": 3.723257418909593e-07, "loss": -0.0096, "num_tokens": 27136918.0, "reward": 0.38333332538604736, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.0833333358168602, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 791 }, { "completion_length": 1909.0834350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 3999.0, "completions/max_terminated_length": 3999.0, "completions/mean_length": 1909.0833740234375, "completions/mean_terminated_length": 1909.0833740234375, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 0.26865671641791045, "frac_reward_zero_std": 0.5, "grad_norm": 0.6697491407394409, "kl": 0.0, "learning_rate": 3.7215320910973084e-07, "loss": 0.0026, "num_tokens": 27169673.0, "reward": 1.0333333015441895, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.44585633277893066, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 792 }, { "completion_length": 594.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 887.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 594.4166870117188, "completions/mean_terminated_length": 594.4166870117188, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.26899592944369066, "frac_reward_zero_std": 0.5, "grad_norm": 0.06149730086326599, "kl": 0.0, "learning_rate": 3.719806763285024e-07, "loss": 0.0001, "num_tokens": 27189796.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 793 }, { "completion_length": 974.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3086.0, "completions/max_terminated_length": 3086.0, "completions/mean_length": 974.5, "completions/mean_terminated_length": 974.5, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.2693351424694708, "frac_reward_zero_std": 1.0, "grad_norm": 2.1048205667284492e-07, "kl": 0.0, "learning_rate": 3.7180814354727395e-07, "loss": 0.0, "num_tokens": 27212596.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 794 }, { "completion_length": 2349.83349609375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4858.0, "completions/mean_length": 2898.916748046875, "completions/mean_terminated_length": 2563.45458984375, "completions/min_length": 1054.0, "completions/min_terminated_length": 1054.0, "epoch": 0.26967435549525104, "frac_reward_zero_std": 0.5, "grad_norm": 0.6481123566627502, "kl": NaN, "learning_rate": 3.7163561076604555e-07, "loss": -0.0263, "num_tokens": 27249230.0, "reward": 0.770833432674408, "reward_std": 0.2123773992061615, "rewards/correctness_reward_func/mean": 0.4833333194255829, "rewards/correctness_reward_func/std": 0.43029236793518066, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 795 }, { "completion_length": 1115.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2305.0, "completions/max_terminated_length": 2305.0, "completions/mean_length": 1115.75, "completions/mean_terminated_length": 1115.75, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.2700135685210312, "frac_reward_zero_std": 0.0, "grad_norm": 0.08609779179096222, "kl": 0.0, "learning_rate": 3.7146307798481705e-07, "loss": 0.0008, "num_tokens": 27273443.0, "reward": 1.2708332538604736, "reward_std": 0.07144343107938766, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 796 }, { "completion_length": 1959.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3710.0, "completions/max_terminated_length": 3710.0, "completions/mean_length": 1959.416748046875, "completions/mean_terminated_length": 1959.416748046875, "completions/min_length": 1178.0, "completions/min_terminated_length": 1178.0, "epoch": 0.2703527815468114, "frac_reward_zero_std": 0.0, "grad_norm": 0.7615066766738892, "kl": 0.0, "learning_rate": 3.7129054520358866e-07, "loss": 0.0246, "num_tokens": 27307120.0, "reward": 0.7666666507720947, "reward_std": 0.4647580087184906, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 797 }, { "completion_length": 858.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1442.0, "completions/max_terminated_length": 1442.0, "completions/mean_length": 858.9166870117188, "completions/mean_terminated_length": 858.9166870117188, "completions/min_length": 540.0, "completions/min_terminated_length": 540.0, "epoch": 0.2706919945725916, "frac_reward_zero_std": 0.5, "grad_norm": 0.3698106110095978, "kl": 0.0, "learning_rate": 3.711180124223602e-07, "loss": 0.0015, "num_tokens": 27332229.0, "reward": 1.0208334922790527, "reward_std": 0.19391795992851257, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 798 }, { "completion_length": 2415.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4826.0, "completions/max_terminated_length": 4826.0, "completions/mean_length": 2415.666748046875, "completions/mean_terminated_length": 2415.666748046875, "completions/min_length": 793.0, "completions/min_terminated_length": 793.0, "epoch": 0.2710312075983718, "frac_reward_zero_std": 0.5, "grad_norm": 0.6135579347610474, "kl": 0.0, "learning_rate": 3.709454796411318e-07, "loss": 0.0115, "num_tokens": 27375395.0, "reward": 1.1708333492279053, "reward_std": 0.22383961081504822, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 799 }, { "completion_length": 1509.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4804.0, "completions/max_terminated_length": 4804.0, "completions/mean_length": 1509.916748046875, "completions/mean_terminated_length": 1509.916748046875, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 0.27137042062415195, "frac_reward_zero_std": 1.0, "grad_norm": 1.8519099853619991e-07, "kl": 0.0, "learning_rate": 3.7077294685990337e-07, "loss": 0.0, "num_tokens": 27403858.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 800 }, { "completion_length": 1219.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2203.0, "completions/max_terminated_length": 2203.0, "completions/mean_length": 1219.166748046875, "completions/mean_terminated_length": 1219.166748046875, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 0.27170963364993217, "frac_reward_zero_std": 0.5, "grad_norm": 0.10737733542919159, "kl": 0.0, "learning_rate": 3.706004140786749e-07, "loss": 0.0009, "num_tokens": 27433344.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 801 }, { "completion_length": 606.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1109.0, "completions/max_terminated_length": 1109.0, "completions/mean_length": 606.6666870117188, "completions/mean_terminated_length": 606.6666870117188, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.27204884667571233, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.7042788129744653e-07, "loss": 0.0, "num_tokens": 27451136.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 802 }, { "completion_length": 1556.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3046.0, "completions/max_terminated_length": 3046.0, "completions/mean_length": 1556.5, "completions/mean_terminated_length": 1556.5, "completions/min_length": 687.0, "completions/min_terminated_length": 687.0, "epoch": 0.27238805970149255, "frac_reward_zero_std": 0.5, "grad_norm": 0.5762483477592468, "kl": 0.0, "learning_rate": 3.702553485162181e-07, "loss": 0.0254, "num_tokens": 27480152.0, "reward": 1.1208332777023315, "reward_std": 0.27857524156570435, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.3892494738101959, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 803 }, { "completion_length": 1405.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2416.0, "completions/max_terminated_length": 2416.0, "completions/mean_length": 1405.0, "completions/mean_terminated_length": 1405.0, "completions/min_length": 628.0, "completions/min_terminated_length": 628.0, "epoch": 0.2727272727272727, "frac_reward_zero_std": 0.5, "grad_norm": 0.4991465210914612, "kl": 0.0, "learning_rate": 3.7008281573498964e-07, "loss": -0.0007, "num_tokens": 27506474.0, "reward": 0.46666666865348816, "reward_std": 0.2581988573074341, "rewards/correctness_reward_func/mean": 0.1666666716337204, "rewards/correctness_reward_func/std": 0.3892494738101959, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 804 }, { "completion_length": 1961.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3442.0, "completions/max_terminated_length": 3442.0, "completions/mean_length": 1961.3333740234375, "completions/mean_terminated_length": 1961.3333740234375, "completions/min_length": 982.0, "completions/min_terminated_length": 982.0, "epoch": 0.2730664857530529, "frac_reward_zero_std": 1.0, "grad_norm": 1.4712648521708616e-07, "kl": 0.0, "learning_rate": 3.699102829537612e-07, "loss": 0.0, "num_tokens": 27545088.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 805 }, { "completion_length": 1769.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3968.0, "completions/max_terminated_length": 3968.0, "completions/mean_length": 1769.0, "completions/mean_terminated_length": 1769.0, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "epoch": 0.2734056987788331, "frac_reward_zero_std": 0.0, "grad_norm": 0.11471240222454071, "kl": 0.0, "learning_rate": 3.697377501725328e-07, "loss": -0.0003, "num_tokens": 27581796.0, "reward": 1.1666667461395264, "reward_std": 0.09559707343578339, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 806 }, { "completion_length": 1644.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3036.0, "completions/max_terminated_length": 3036.0, "completions/mean_length": 1644.75, "completions/mean_terminated_length": 1644.75, "completions/min_length": 803.0, "completions/min_terminated_length": 803.0, "epoch": 0.2737449118046133, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.695652173913043e-07, "loss": 0.0, "num_tokens": 27614289.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 807 }, { "completion_length": 1305.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3489.0, "completions/max_terminated_length": 3489.0, "completions/mean_length": 1305.0833740234375, "completions/mean_terminated_length": 1305.0833740234375, "completions/min_length": 634.0, "completions/min_terminated_length": 634.0, "epoch": 0.27408412483039346, "frac_reward_zero_std": 0.5, "grad_norm": 0.5795224905014038, "kl": 0.0, "learning_rate": 3.693926846100759e-07, "loss": 0.0063, "num_tokens": 27644182.0, "reward": 1.0499999523162842, "reward_std": 0.27386125922203064, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.45226702094078064, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 808 }, { "completion_length": 1074.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1799.0, "completions/max_terminated_length": 1799.0, "completions/mean_length": 1074.166748046875, "completions/mean_terminated_length": 1074.166748046875, "completions/min_length": 687.0, "completions/min_terminated_length": 687.0, "epoch": 0.2744233378561737, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.6922015182884745e-07, "loss": 0.0, "num_tokens": 27668220.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 809 }, { "completion_length": 974.9167175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 1649.0, "completions/max_terminated_length": 1649.0, "completions/mean_length": 974.9166870117188, "completions/mean_terminated_length": 974.9166870117188, "completions/min_length": 667.0, "completions/min_terminated_length": 667.0, "epoch": 0.2747625508819539, "frac_reward_zero_std": 0.5, "grad_norm": 0.06648023426532745, "kl": 0.0, "learning_rate": 3.6904761904761906e-07, "loss": -0.0, "num_tokens": 27695639.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 810 }, { "completion_length": 1228.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 1766.0, "completions/max_terminated_length": 1766.0, "completions/mean_length": 1228.666748046875, "completions/mean_terminated_length": 1228.666748046875, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 0.27510176390773405, "frac_reward_zero_std": 0.0, "grad_norm": 0.3744871914386749, "kl": 0.0, "learning_rate": 3.6887508626639056e-07, "loss": 0.0064, "num_tokens": 27720211.0, "reward": 1.1166667938232422, "reward_std": 0.24096208810806274, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 811 }, { "completion_length": 1277.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1874.0, "completions/max_terminated_length": 1874.0, "completions/mean_length": 1277.75, "completions/mean_terminated_length": 1277.75, "completions/min_length": 791.0, "completions/min_terminated_length": 791.0, "epoch": 0.27544097693351427, "frac_reward_zero_std": 0.5, "grad_norm": 0.08306435495615005, "kl": 0.0, "learning_rate": 3.6870255348516216e-07, "loss": 0.0017, "num_tokens": 27745120.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 812 }, { "completion_length": 1650.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3301.0, "completions/max_terminated_length": 3301.0, "completions/mean_length": 1650.916748046875, "completions/mean_terminated_length": 1650.916748046875, "completions/min_length": 864.0, "completions/min_terminated_length": 864.0, "epoch": 0.27578018995929443, "frac_reward_zero_std": 0.0, "grad_norm": 0.23837661743164062, "kl": 0.0, "learning_rate": 3.685300207039337e-07, "loss": 0.0016, "num_tokens": 27777681.0, "reward": 0.7833334803581238, "reward_std": 0.20412415266036987, "rewards/correctness_reward_func/mean": 0.4833333194255829, "rewards/correctness_reward_func/std": 0.43029236793518066, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 813 }, { "completion_length": 703.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1160.0, "completions/max_terminated_length": 1160.0, "completions/mean_length": 703.1666870117188, "completions/mean_terminated_length": 703.1666870117188, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.27611940298507465, "frac_reward_zero_std": 0.5, "grad_norm": 0.1073731854557991, "kl": 0.0, "learning_rate": 3.683574879227053e-07, "loss": -0.0005, "num_tokens": 27798101.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 814 }, { "completion_length": 1182.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1865.0, "completions/max_terminated_length": 1865.0, "completions/mean_length": 1182.75, "completions/mean_terminated_length": 1182.75, "completions/min_length": 749.0, "completions/min_terminated_length": 749.0, "epoch": 0.2764586160108548, "frac_reward_zero_std": 0.5, "grad_norm": 0.0674026682972908, "kl": 0.0, "learning_rate": 3.681849551414769e-07, "loss": 0.0014, "num_tokens": 27818804.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 815 }, { "completion_length": 1002.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1656.0, "completions/max_terminated_length": 1656.0, "completions/mean_length": 1002.5833740234375, "completions/mean_terminated_length": 1002.5833740234375, "completions/min_length": 650.0, "completions/min_terminated_length": 650.0, "epoch": 0.276797829036635, "frac_reward_zero_std": 1.0, "grad_norm": 1.705409573560246e-07, "kl": 0.0, "learning_rate": 3.6801242236024843e-07, "loss": 0.0, "num_tokens": 27847707.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 816 }, { "completion_length": 2383.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 6308.0, "completions/max_terminated_length": 6308.0, "completions/mean_length": 2383.0, "completions/mean_terminated_length": 2383.0, "completions/min_length": 501.0, "completions/min_terminated_length": 501.0, "epoch": 0.2771370420624152, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.6783988957902003e-07, "loss": 0.0, "num_tokens": 27886677.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 817 }, { "completion_length": 2868.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6046.0, "completions/max_terminated_length": 6046.0, "completions/mean_length": 2868.666748046875, "completions/mean_terminated_length": 2868.666748046875, "completions/min_length": 1894.0, "completions/min_terminated_length": 1894.0, "epoch": 0.2774762550881954, "frac_reward_zero_std": 0.5, "grad_norm": 0.09324238449335098, "kl": 0.0, "learning_rate": 3.6766735679779153e-07, "loss": -0.0024, "num_tokens": 27936599.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 818 }, { "completion_length": 1085.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1940.0, "completions/max_terminated_length": 1940.0, "completions/mean_length": 1085.0833740234375, "completions/mean_terminated_length": 1085.0833740234375, "completions/min_length": 498.0, "completions/min_terminated_length": 498.0, "epoch": 0.27781546811397556, "frac_reward_zero_std": 1.0, "grad_norm": 1.5154705579334404e-07, "kl": 0.0, "learning_rate": 3.6749482401656314e-07, "loss": 0.0, "num_tokens": 27962658.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 819 }, { "completion_length": 2202.2501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 5019.0, "completions/max_terminated_length": 5019.0, "completions/mean_length": 2202.25, "completions/mean_terminated_length": 2202.25, "completions/min_length": 559.0, "completions/min_terminated_length": 559.0, "epoch": 0.2781546811397558, "frac_reward_zero_std": 0.0, "grad_norm": 0.5000991821289062, "kl": 0.0, "learning_rate": 3.673222912353347e-07, "loss": 0.0171, "num_tokens": 28000857.0, "reward": 0.8208333849906921, "reward_std": 0.23816029727458954, "rewards/correctness_reward_func/mean": 0.5333333015441895, "rewards/correctness_reward_func/std": 0.47736650705337524, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 820 }, { "completion_length": 3302.916748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6512.0, "completions/mean_length": 3852.0, "completions/mean_terminated_length": 3603.181884765625, "completions/min_length": 915.0, "completions/min_terminated_length": 915.0, "epoch": 0.27849389416553594, "frac_reward_zero_std": 0.5, "grad_norm": 0.2379552721977234, "kl": NaN, "learning_rate": 3.671497584541063e-07, "loss": -0.0582, "num_tokens": 28052630.0, "reward": 1.120833396911621, "reward_std": 0.19391795992851257, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.28069180250167847, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 821 }, { "completion_length": 2307.2501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 3861.0, "completions/max_terminated_length": 3861.0, "completions/mean_length": 2307.25, "completions/mean_terminated_length": 2307.25, "completions/min_length": 1338.0, "completions/min_terminated_length": 1338.0, "epoch": 0.27883310719131615, "frac_reward_zero_std": 0.0, "grad_norm": 0.6435486674308777, "kl": 0.0, "learning_rate": 3.669772256728778e-07, "loss": 0.0243, "num_tokens": 28093595.0, "reward": 0.8833333849906921, "reward_std": 0.29902368783950806, "rewards/correctness_reward_func/mean": 0.5833333134651184, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 822 }, { "completion_length": 2714.2501220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5474.0, "completions/mean_length": 3263.33349609375, "completions/mean_terminated_length": 2961.0, "completions/min_length": 859.0, "completions/min_terminated_length": 859.0, "epoch": 0.2791723202170963, "frac_reward_zero_std": 0.5, "grad_norm": 0.2697756886482239, "kl": NaN, "learning_rate": 3.668046928916494e-07, "loss": -0.0554, "num_tokens": 28140860.0, "reward": 1.0416667461395264, "reward_std": 0.2457980364561081, "rewards/correctness_reward_func/mean": 0.7666667103767395, "rewards/correctness_reward_func/std": 0.2534608840942383, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 823 }, { "completion_length": 2003.3334350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 4538.0, "completions/max_terminated_length": 4538.0, "completions/mean_length": 2003.3333740234375, "completions/mean_terminated_length": 2003.3333740234375, "completions/min_length": 551.0, "completions/min_terminated_length": 551.0, "epoch": 0.27951153324287653, "frac_reward_zero_std": 0.5, "grad_norm": 0.072422556579113, "kl": 0.0, "learning_rate": 3.6663216011042096e-07, "loss": -0.0009, "num_tokens": 28174818.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 824 }, { "completion_length": 1121.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1858.0, "completions/max_terminated_length": 1858.0, "completions/mean_length": 1121.5, "completions/mean_terminated_length": 1121.5, "completions/min_length": 850.0, "completions/min_terminated_length": 850.0, "epoch": 0.2798507462686567, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.6645962732919256e-07, "loss": 0.0, "num_tokens": 28197018.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 825 }, { "completion_length": 1077.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3488.0, "completions/max_terminated_length": 3488.0, "completions/mean_length": 1077.416748046875, "completions/mean_terminated_length": 1077.416748046875, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 0.2801899592944369, "frac_reward_zero_std": 0.0, "grad_norm": 0.6790959239006042, "kl": 0.0, "learning_rate": 3.6628709454796406e-07, "loss": 0.0398, "num_tokens": 28224377.0, "reward": 0.9666666984558105, "reward_std": 0.3098386526107788, "rewards/correctness_reward_func/mean": 0.6666666269302368, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 826 }, { "completion_length": 690.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1055.0, "completions/max_terminated_length": 1055.0, "completions/mean_length": 690.6666870117188, "completions/mean_terminated_length": 690.6666870117188, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "epoch": 0.2805291723202171, "frac_reward_zero_std": 1.0, "grad_norm": 1.7838985399976082e-07, "kl": 0.0, "learning_rate": 3.6611456176673567e-07, "loss": 0.0, "num_tokens": 28246261.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 827 }, { "completion_length": 2305.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6477.0, "completions/max_terminated_length": 6477.0, "completions/mean_length": 2305.416748046875, "completions/mean_terminated_length": 2305.416748046875, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.2808683853459973, "frac_reward_zero_std": 0.0, "grad_norm": 0.897094190120697, "kl": 0.0, "learning_rate": 3.659420289855072e-07, "loss": -0.0296, "num_tokens": 28284372.0, "reward": 0.8000000715255737, "reward_std": 0.24494895339012146, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.4472135901451111, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 828 }, { "completion_length": 3132.916748046875, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 6016.0, "completions/mean_length": 4780.1669921875, "completions/mean_terminated_length": 4177.22216796875, "completions/min_length": 2212.0, "completions/min_terminated_length": 2212.0, "epoch": 0.2812075983717775, "frac_reward_zero_std": 0.0, "grad_norm": 0.5745806097984314, "kl": NaN, "learning_rate": 3.657694962042788e-07, "loss": -0.0817, "num_tokens": 28334783.0, "reward": 0.9000000953674316, "reward_std": 0.3510836958885193, "rewards/correctness_reward_func/mean": 0.6500000357627869, "rewards/correctness_reward_func/std": 0.40113475918769836, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 829 }, { "completion_length": 1060.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1373.0, "completions/max_terminated_length": 1373.0, "completions/mean_length": 1060.5833740234375, "completions/mean_terminated_length": 1060.5833740234375, "completions/min_length": 712.0, "completions/min_terminated_length": 712.0, "epoch": 0.28154681139755766, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.6559696342305033e-07, "loss": 0.0, "num_tokens": 28357872.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 830 }, { "completion_length": 1614.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3408.0, "completions/max_terminated_length": 3408.0, "completions/mean_length": 1614.5833740234375, "completions/mean_terminated_length": 1614.5833740234375, "completions/min_length": 524.0, "completions/min_terminated_length": 524.0, "epoch": 0.2818860244233379, "frac_reward_zero_std": 0.5, "grad_norm": 0.05196267366409302, "kl": 0.0, "learning_rate": 3.6542443064182193e-07, "loss": -0.0006, "num_tokens": 28387807.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 831 }, { "completion_length": 1116.5, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 3091.0, "completions/mean_length": 2763.75, "completions/mean_terminated_length": 1488.6666259765625, "completions/min_length": 654.0, "completions/min_terminated_length": 654.0, "epoch": 0.28222523744911804, "frac_reward_zero_std": 0.5, "grad_norm": 1.0311174392700195, "kl": NaN, "learning_rate": 3.6525189786059354e-07, "loss": -0.0566, "num_tokens": 28413589.0, "reward": 0.4749999940395355, "reward_std": 0.35601967573165894, "rewards/correctness_reward_func/mean": 0.25, "rewards/correctness_reward_func/std": 0.45226702094078064, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 832 }, { "completion_length": 905.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1330.0, "completions/max_terminated_length": 1330.0, "completions/mean_length": 905.0, "completions/mean_terminated_length": 905.0, "completions/min_length": 630.0, "completions/min_terminated_length": 630.0, "epoch": 0.28256445047489825, "frac_reward_zero_std": 1.0, "grad_norm": 2.9492409225895244e-07, "kl": 0.0, "learning_rate": 3.6507936507936504e-07, "loss": 0.0, "num_tokens": 28437493.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 833 }, { "completion_length": 1761.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3188.0, "completions/max_terminated_length": 3188.0, "completions/mean_length": 1761.25, "completions/mean_terminated_length": 1761.25, "completions/min_length": 734.0, "completions/min_terminated_length": 734.0, "epoch": 0.2829036635006784, "frac_reward_zero_std": 0.5, "grad_norm": 0.07774775475263596, "kl": 0.0, "learning_rate": 3.6490683229813664e-07, "loss": -0.0002, "num_tokens": 28470862.0, "reward": 1.133333444595337, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 834 }, { "completion_length": 758.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1266.0, "completions/max_terminated_length": 1266.0, "completions/mean_length": 758.8333740234375, "completions/mean_terminated_length": 758.8333740234375, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 0.28324287652645863, "frac_reward_zero_std": 1.0, "grad_norm": 1.0557556606727303e-07, "kl": 0.0, "learning_rate": 3.647342995169082e-07, "loss": 0.0, "num_tokens": 28494248.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 835 }, { "completion_length": 2812.0834350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 6122.0, "completions/max_terminated_length": 6122.0, "completions/mean_length": 2812.08349609375, "completions/mean_terminated_length": 2812.08349609375, "completions/min_length": 758.0, "completions/min_terminated_length": 758.0, "epoch": 0.2835820895522388, "frac_reward_zero_std": 0.5, "grad_norm": 0.5875731706619263, "kl": 0.0, "learning_rate": 3.645617667356798e-07, "loss": 0.0046, "num_tokens": 28534869.0, "reward": 0.9541667699813843, "reward_std": 0.19900795817375183, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 836 }, { "completion_length": 1912.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5484.0, "completions/max_terminated_length": 5484.0, "completions/mean_length": 1912.25, "completions/mean_terminated_length": 1912.25, "completions/min_length": 714.0, "completions/min_terminated_length": 714.0, "epoch": 0.283921302578019, "frac_reward_zero_std": 1.0, "grad_norm": 1.538487879315653e-07, "kl": 0.0, "learning_rate": 3.643892339544513e-07, "loss": 0.0, "num_tokens": 28567734.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 837 }, { "completion_length": 942.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1422.0, "completions/max_terminated_length": 1422.0, "completions/mean_length": 942.0, "completions/mean_terminated_length": 942.0, "completions/min_length": 564.0, "completions/min_terminated_length": 564.0, "epoch": 0.28426051560379917, "frac_reward_zero_std": 0.5, "grad_norm": 0.08199096471071243, "kl": 0.0, "learning_rate": 3.642167011732229e-07, "loss": 0.0005, "num_tokens": 28595202.0, "reward": 0.7333333492279053, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 838 }, { "completion_length": 2428.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4072.0, "completions/max_terminated_length": 4072.0, "completions/mean_length": 2428.916748046875, "completions/mean_terminated_length": 2428.916748046875, "completions/min_length": 1415.0, "completions/min_terminated_length": 1415.0, "epoch": 0.2845997286295794, "frac_reward_zero_std": 0.5, "grad_norm": 0.8435646891593933, "kl": 0.0, "learning_rate": 3.6404416839199446e-07, "loss": 0.0061, "num_tokens": 28631879.0, "reward": 0.8666667342185974, "reward_std": 0.2581988573074341, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.42497774958610535, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 839 }, { "completion_length": 1648.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5064.0, "completions/max_terminated_length": 5064.0, "completions/mean_length": 1648.666748046875, "completions/mean_terminated_length": 1648.666748046875, "completions/min_length": 652.0, "completions/min_terminated_length": 652.0, "epoch": 0.28493894165535955, "frac_reward_zero_std": 0.5, "grad_norm": 0.09471841156482697, "kl": 0.0, "learning_rate": 3.63871635610766e-07, "loss": -0.0025, "num_tokens": 28661653.0, "reward": 1.1666667461395264, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 840 }, { "completion_length": 1834.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2920.0, "completions/max_terminated_length": 2920.0, "completions/mean_length": 1834.8333740234375, "completions/mean_terminated_length": 1834.8333740234375, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "epoch": 0.28527815468113976, "frac_reward_zero_std": 0.5, "grad_norm": 0.0629253163933754, "kl": 0.0, "learning_rate": 3.6369910282953757e-07, "loss": 0.0021, "num_tokens": 28697795.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 841 }, { "completion_length": 1036.1667175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 1895.0, "completions/max_terminated_length": 1895.0, "completions/mean_length": 1036.166748046875, "completions/mean_terminated_length": 1036.166748046875, "completions/min_length": 578.0, "completions/min_terminated_length": 578.0, "epoch": 0.2856173677069199, "frac_reward_zero_std": 0.5, "grad_norm": 0.09502989798784256, "kl": 0.0, "learning_rate": 3.6352657004830917e-07, "loss": -0.0, "num_tokens": 28721341.0, "reward": 1.2333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 842 }, { "completion_length": 898.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1205.0, "completions/max_terminated_length": 1205.0, "completions/mean_length": 898.9166870117188, "completions/mean_terminated_length": 898.9166870117188, "completions/min_length": 597.0, "completions/min_terminated_length": 597.0, "epoch": 0.28595658073270014, "frac_reward_zero_std": 1.0, "grad_norm": 9.972598036256386e-08, "kl": 0.0, "learning_rate": 3.633540372670807e-07, "loss": 0.0, "num_tokens": 28743330.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 843 }, { "completion_length": 1803.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 5706.0, "completions/max_terminated_length": 5706.0, "completions/mean_length": 1803.75, "completions/mean_terminated_length": 1803.75, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.2862957937584803, "frac_reward_zero_std": 0.5, "grad_norm": 0.4343622028827667, "kl": 0.0, "learning_rate": 3.631815044858523e-07, "loss": 0.0283, "num_tokens": 28779027.0, "reward": 0.38333332538604736, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.0833333358168602, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 844 }, { "completion_length": 1681.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3518.0, "completions/max_terminated_length": 3518.0, "completions/mean_length": 1681.75, "completions/mean_terminated_length": 1681.75, "completions/min_length": 950.0, "completions/min_terminated_length": 950.0, "epoch": 0.2866350067842605, "frac_reward_zero_std": 0.5, "grad_norm": 0.07720570266246796, "kl": 0.0, "learning_rate": 3.6300897170462383e-07, "loss": -0.0016, "num_tokens": 28814628.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 845 }, { "completion_length": 1228.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2419.0, "completions/max_terminated_length": 2419.0, "completions/mean_length": 1228.666748046875, "completions/mean_terminated_length": 1228.666748046875, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.28697421981004073, "frac_reward_zero_std": 1.0, "grad_norm": 1.1242604358585595e-07, "kl": 0.0, "learning_rate": 3.6283643892339544e-07, "loss": 0.0, "num_tokens": 28844198.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 846 }, { "completion_length": 2611.666748046875, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6019.0, "completions/mean_length": 3709.83349609375, "completions/mean_terminated_length": 3134.0, "completions/min_length": 1123.0, "completions/min_terminated_length": 1123.0, "epoch": 0.2873134328358209, "frac_reward_zero_std": 0.0, "grad_norm": 0.6555944085121155, "kl": NaN, "learning_rate": 3.6266390614216704e-07, "loss": -0.0615, "num_tokens": 28886206.0, "reward": 0.7500001192092896, "reward_std": 0.280963659286499, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.4472135901451111, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 847 }, { "completion_length": 774.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2040.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 774.75, "completions/mean_terminated_length": 774.75, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.2876526458616011, "frac_reward_zero_std": 1.0, "grad_norm": 1.7893466974783223e-07, "kl": 0.0, "learning_rate": 3.6249137336093854e-07, "loss": 0.0, "num_tokens": 28903849.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 848 }, { "completion_length": 2093.2501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4633.0, "completions/max_terminated_length": 4633.0, "completions/mean_length": 2093.25, "completions/mean_terminated_length": 2093.25, "completions/min_length": 529.0, "completions/min_terminated_length": 529.0, "epoch": 0.28799185888738127, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.6231884057971015e-07, "loss": 0.0, "num_tokens": 28938424.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 849 }, { "completion_length": 2106.7501220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4592.0, "completions/mean_length": 2655.83349609375, "completions/mean_terminated_length": 2298.272705078125, "completions/min_length": 1069.0, "completions/min_terminated_length": 1069.0, "epoch": 0.2883310719131615, "frac_reward_zero_std": 0.5, "grad_norm": 0.1451241672039032, "kl": NaN, "learning_rate": 3.621463077984817e-07, "loss": -0.0101, "num_tokens": 28975171.0, "reward": 0.6750000715255737, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 850 }, { "completion_length": 1222.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2122.0, "completions/max_terminated_length": 2122.0, "completions/mean_length": 1222.5833740234375, "completions/mean_terminated_length": 1222.5833740234375, "completions/min_length": 738.0, "completions/min_terminated_length": 738.0, "epoch": 0.28867028493894165, "frac_reward_zero_std": 1.0, "grad_norm": 1.7627006343445828e-07, "kl": 0.0, "learning_rate": 3.6197377501725325e-07, "loss": 0.0, "num_tokens": 29000672.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 851 }, { "completion_length": 1198.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3363.0, "completions/max_terminated_length": 3363.0, "completions/mean_length": 1198.0, "completions/mean_terminated_length": 1198.0, "completions/min_length": 603.0, "completions/min_terminated_length": 603.0, "epoch": 0.28900949796472186, "frac_reward_zero_std": 0.5, "grad_norm": 0.040780119597911835, "kl": 0.0, "learning_rate": 3.618012422360248e-07, "loss": -0.0002, "num_tokens": 29028500.0, "reward": 0.2875000238418579, "reward_std": 0.03061862289905548, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 852 }, { "completion_length": 3095.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5384.0, "completions/max_terminated_length": 5384.0, "completions/mean_length": 3095.416748046875, "completions/mean_terminated_length": 3095.416748046875, "completions/min_length": 977.0, "completions/min_terminated_length": 977.0, "epoch": 0.289348710990502, "frac_reward_zero_std": 0.5, "grad_norm": 0.6323304772377014, "kl": 0.0, "learning_rate": 3.616287094547964e-07, "loss": -0.0108, "num_tokens": 29076835.0, "reward": 0.7166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.5149286985397339, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 853 }, { "completion_length": 938.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1590.0, "completions/max_terminated_length": 1590.0, "completions/mean_length": 938.5, "completions/mean_terminated_length": 938.5, "completions/min_length": 531.0, "completions/min_terminated_length": 531.0, "epoch": 0.28968792401628224, "frac_reward_zero_std": 1.0, "grad_norm": 9.98534375185045e-08, "kl": 0.0, "learning_rate": 3.6145617667356797e-07, "loss": 0.0, "num_tokens": 29098753.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 854 }, { "completion_length": 1426.1667175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 3073.0, "completions/max_terminated_length": 3073.0, "completions/mean_length": 1426.166748046875, "completions/mean_terminated_length": 1426.166748046875, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "epoch": 0.2900271370420624, "frac_reward_zero_std": 1.0, "grad_norm": 2.5459246444370365e-07, "kl": 0.0, "learning_rate": 3.612836438923395e-07, "loss": 0.0, "num_tokens": 29129841.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 855 }, { "completion_length": 833.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1799.0, "completions/max_terminated_length": 1799.0, "completions/mean_length": 833.8333740234375, "completions/mean_terminated_length": 833.8333740234375, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 0.2903663500678426, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.6111111111111107e-07, "loss": 0.0, "num_tokens": 29150641.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 856 }, { "completion_length": 1173.1667175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 3143.0, "completions/max_terminated_length": 3143.0, "completions/mean_length": 1173.166748046875, "completions/mean_terminated_length": 1173.166748046875, "completions/min_length": 670.0, "completions/min_terminated_length": 670.0, "epoch": 0.2907055630936228, "frac_reward_zero_std": 0.5, "grad_norm": 0.09201031923294067, "kl": 0.0, "learning_rate": 3.609385783298827e-07, "loss": 0.0009, "num_tokens": 29175879.0, "reward": 1.25, "reward_std": 0.05477222427725792, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 857 }, { "completion_length": 1476.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2281.0, "completions/max_terminated_length": 2281.0, "completions/mean_length": 1476.416748046875, "completions/mean_terminated_length": 1476.416748046875, "completions/min_length": 635.0, "completions/min_terminated_length": 635.0, "epoch": 0.291044776119403, "frac_reward_zero_std": 1.0, "grad_norm": 1.1626907081563331e-07, "kl": 0.0, "learning_rate": 3.6076604554865423e-07, "loss": 0.0, "num_tokens": 29205764.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 858 }, { "completion_length": 2051.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3636.0, "completions/max_terminated_length": 3636.0, "completions/mean_length": 2051.33349609375, "completions/mean_terminated_length": 2051.33349609375, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 0.29138398914518315, "frac_reward_zero_std": 0.5, "grad_norm": 0.4645165503025055, "kl": 0.0, "learning_rate": 3.605935127674258e-07, "loss": -0.0135, "num_tokens": 29241348.0, "reward": 0.38333332538604736, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.0833333358168602, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 859 }, { "completion_length": 2124.3334350585938, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5150.0, "completions/mean_length": 2673.416748046875, "completions/mean_terminated_length": 2317.45458984375, "completions/min_length": 672.0, "completions/min_terminated_length": 672.0, "epoch": 0.29172320217096337, "frac_reward_zero_std": 0.5, "grad_norm": 0.17607992887496948, "kl": NaN, "learning_rate": 3.6042097998619734e-07, "loss": -0.0124, "num_tokens": 29280682.0, "reward": 0.7749999761581421, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 860 }, { "completion_length": 2468.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5737.0, "completions/max_terminated_length": 5737.0, "completions/mean_length": 2468.416748046875, "completions/mean_terminated_length": 2468.416748046875, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 0.29206241519674353, "frac_reward_zero_std": 0.5, "grad_norm": 0.25238487124443054, "kl": 0.0, "learning_rate": 3.6024844720496894e-07, "loss": 0.0017, "num_tokens": 29323107.0, "reward": 1.1166667938232422, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 861 }, { "completion_length": 1086.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2235.0, "completions/max_terminated_length": 2235.0, "completions/mean_length": 1086.0833740234375, "completions/mean_terminated_length": 1086.0833740234375, "completions/min_length": 657.0, "completions/min_terminated_length": 657.0, "epoch": 0.29240162822252375, "frac_reward_zero_std": 0.5, "grad_norm": 0.09536635130643845, "kl": 0.0, "learning_rate": 3.600759144237405e-07, "loss": -0.0001, "num_tokens": 29345398.0, "reward": 1.1666667461395264, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 862 }, { "completion_length": 2525.0833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4973.0, "completions/mean_length": 3074.166748046875, "completions/mean_terminated_length": 2754.636474609375, "completions/min_length": 1133.0, "completions/min_terminated_length": 1133.0, "epoch": 0.29274084124830396, "frac_reward_zero_std": 0.0, "grad_norm": 0.4000323414802551, "kl": NaN, "learning_rate": 3.5990338164251205e-07, "loss": -0.0039, "num_tokens": 29389211.0, "reward": 0.6416666507720947, "reward_std": 0.2474271059036255, "rewards/correctness_reward_func/mean": 0.36666664481163025, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 863 }, { "completion_length": 940.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1529.0, "completions/max_terminated_length": 1529.0, "completions/mean_length": 940.5833740234375, "completions/mean_terminated_length": 940.5833740234375, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 0.2930800542740841, "frac_reward_zero_std": 0.5, "grad_norm": 0.42567434906959534, "kl": 0.0, "learning_rate": 3.5973084886128365e-07, "loss": 0.0036, "num_tokens": 29410548.0, "reward": 1.058333396911621, "reward_std": 0.2239791601896286, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.26227444410324097, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 864 }, { "completion_length": 1378.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2369.0, "completions/max_terminated_length": 2369.0, "completions/mean_length": 1378.25, "completions/mean_terminated_length": 1378.25, "completions/min_length": 692.0, "completions/min_terminated_length": 692.0, "epoch": 0.29341926729986434, "frac_reward_zero_std": 0.5, "grad_norm": 0.08405417203903198, "kl": 0.0, "learning_rate": 3.595583160800552e-07, "loss": -0.0007, "num_tokens": 29439381.0, "reward": 1.133333444595337, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 865 }, { "completion_length": 1817.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3688.0, "completions/max_terminated_length": 3688.0, "completions/mean_length": 1817.5, "completions/mean_terminated_length": 1817.5, "completions/min_length": 645.0, "completions/min_terminated_length": 645.0, "epoch": 0.2937584803256445, "frac_reward_zero_std": 1.0, "grad_norm": 2.4541046172998904e-07, "kl": 0.0, "learning_rate": 3.5938578329882676e-07, "loss": 0.0, "num_tokens": 29472459.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 866 }, { "completion_length": 1045.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3683.0, "completions/max_terminated_length": 3683.0, "completions/mean_length": 1045.75, "completions/mean_terminated_length": 1045.75, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 0.2940976933514247, "frac_reward_zero_std": 0.5, "grad_norm": 0.0929461270570755, "kl": 0.0, "learning_rate": 3.592132505175983e-07, "loss": -0.0029, "num_tokens": 29493606.0, "reward": 0.7666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 867 }, { "completion_length": 1191.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2802.0, "completions/max_terminated_length": 2802.0, "completions/mean_length": 1191.0833740234375, "completions/mean_terminated_length": 1191.0833740234375, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.2944369063772049, "frac_reward_zero_std": 0.5, "grad_norm": 0.08606947213411331, "kl": 0.0, "learning_rate": 3.590407177363699e-07, "loss": -0.0002, "num_tokens": 29520937.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 868 }, { "completion_length": 2072.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4117.0, "completions/max_terminated_length": 4117.0, "completions/mean_length": 2072.5, "completions/mean_terminated_length": 2072.5, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "epoch": 0.2947761194029851, "frac_reward_zero_std": 0.0, "grad_norm": 0.15073060989379883, "kl": 0.0, "learning_rate": 3.5886818495514147e-07, "loss": -0.0003, "num_tokens": 29560627.0, "reward": 1.2333333492279053, "reward_std": 0.0955970510840416, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 869 }, { "completion_length": 1233.9166870117188, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 2920.0, "completions/mean_length": 2332.08349609375, "completions/mean_terminated_length": 1480.7000732421875, "completions/min_length": 969.0, "completions/min_terminated_length": 969.0, "epoch": 0.29511533242876525, "frac_reward_zero_std": 0.5, "grad_norm": 0.3028711974620819, "kl": NaN, "learning_rate": 3.58695652173913e-07, "loss": -0.0307, "num_tokens": 29584182.0, "reward": 1.0166666507720947, "reward_std": 0.2840188145637512, "rewards/correctness_reward_func/mean": 0.7666666507720947, "rewards/correctness_reward_func/std": 0.3700941801071167, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 870 }, { "completion_length": 1574.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 6154.0, "completions/max_terminated_length": 6154.0, "completions/mean_length": 1574.416748046875, "completions/mean_terminated_length": 1574.416748046875, "completions/min_length": 503.0, "completions/min_terminated_length": 503.0, "epoch": 0.29545454545454547, "frac_reward_zero_std": 0.0, "grad_norm": 0.661939263343811, "kl": 0.0, "learning_rate": 3.585231193926846e-07, "loss": 0.0737, "num_tokens": 29614547.0, "reward": 1.1166666746139526, "reward_std": 0.29902368783950806, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.3857303261756897, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 871 }, { "completion_length": 1510.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3076.0, "completions/max_terminated_length": 3076.0, "completions/mean_length": 1510.666748046875, "completions/mean_terminated_length": 1510.666748046875, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 0.29579375848032563, "frac_reward_zero_std": 0.5, "grad_norm": 0.5096197128295898, "kl": 0.0, "learning_rate": 3.583505866114562e-07, "loss": 0.0023, "num_tokens": 29647225.0, "reward": 1.2166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 872 }, { "completion_length": 3703.3333740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6485.0, "completions/mean_length": 4252.4169921875, "completions/mean_terminated_length": 4040.0, "completions/min_length": 1202.0, "completions/min_terminated_length": 1202.0, "epoch": 0.29613297150610585, "frac_reward_zero_std": 0.5, "grad_norm": 0.2425873577594757, "kl": NaN, "learning_rate": 3.581780538302277e-07, "loss": -0.0457, "num_tokens": 29703071.0, "reward": 0.625, "reward_std": 0.23611438274383545, "rewards/correctness_reward_func/mean": 0.3499999940395355, "rewards/correctness_reward_func/std": 0.4358898997306824, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 873 }, { "completion_length": 1448.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2933.0, "completions/max_terminated_length": 2933.0, "completions/mean_length": 1448.666748046875, "completions/mean_terminated_length": 1448.666748046875, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.296472184531886, "frac_reward_zero_std": 0.5, "grad_norm": 0.0715579092502594, "kl": 0.0, "learning_rate": 3.580055210489993e-07, "loss": -0.0002, "num_tokens": 29735509.0, "reward": 1.1666667461395264, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 874 }, { "completion_length": 821.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1282.0, "completions/max_terminated_length": 1282.0, "completions/mean_length": 821.5, "completions/mean_terminated_length": 821.5, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "epoch": 0.2968113975576662, "frac_reward_zero_std": 0.5, "grad_norm": 0.07250086963176727, "kl": 0.0, "learning_rate": 3.5783298826777084e-07, "loss": -0.0009, "num_tokens": 29759443.0, "reward": 1.1666667461395264, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 875 }, { "completion_length": 1254.8333740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 3132.0, "completions/mean_length": 1803.916748046875, "completions/mean_terminated_length": 1368.9091796875, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 0.2971506105834464, "frac_reward_zero_std": 0.5, "grad_norm": 0.47821852564811707, "kl": NaN, "learning_rate": 3.5766045548654245e-07, "loss": -0.0038, "num_tokens": 29784725.0, "reward": 0.6083333492279053, "reward_std": 0.3006936311721802, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 876 }, { "completion_length": 1243.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1934.0, "completions/max_terminated_length": 1934.0, "completions/mean_length": 1243.75, "completions/mean_terminated_length": 1243.75, "completions/min_length": 563.0, "completions/min_terminated_length": 563.0, "epoch": 0.2974898236092266, "frac_reward_zero_std": 0.0, "grad_norm": 0.5256324410438538, "kl": 0.0, "learning_rate": 3.5748792270531395e-07, "loss": -0.0066, "num_tokens": 29811872.0, "reward": 1.0916666984558105, "reward_std": 0.26536136865615845, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 877 }, { "completion_length": 1533.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5486.0, "completions/max_terminated_length": 5486.0, "completions/mean_length": 1533.0, "completions/mean_terminated_length": 1533.0, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "epoch": 0.29782903663500676, "frac_reward_zero_std": 0.5, "grad_norm": 0.3232329785823822, "kl": 0.0, "learning_rate": 3.5731538992408555e-07, "loss": -0.0113, "num_tokens": 29842292.0, "reward": 0.6333333253860474, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 878 }, { "completion_length": 1707.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3730.0, "completions/max_terminated_length": 3730.0, "completions/mean_length": 1707.3333740234375, "completions/mean_terminated_length": 1707.3333740234375, "completions/min_length": 744.0, "completions/min_terminated_length": 744.0, "epoch": 0.298168249660787, "frac_reward_zero_std": 1.0, "grad_norm": 1.7474680191753578e-07, "kl": 0.0, "learning_rate": 3.5714285714285716e-07, "loss": 0.0, "num_tokens": 29872560.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 879 }, { "completion_length": 1907.8334350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 5257.0, "completions/max_terminated_length": 5257.0, "completions/mean_length": 1907.8333740234375, "completions/mean_terminated_length": 1907.8333740234375, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.29850746268656714, "frac_reward_zero_std": 1.0, "grad_norm": 3.42220346283284e-07, "kl": 0.0, "learning_rate": 3.569703243616287e-07, "loss": 0.0, "num_tokens": 29907760.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 880 }, { "completion_length": 1137.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1893.0, "completions/max_terminated_length": 1893.0, "completions/mean_length": 1137.8333740234375, "completions/mean_terminated_length": 1137.8333740234375, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 0.29884667571234735, "frac_reward_zero_std": 0.5, "grad_norm": 0.066347137093544, "kl": 0.0, "learning_rate": 3.5679779158040026e-07, "loss": 0.0011, "num_tokens": 29931818.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 881 }, { "completion_length": 1808.0834350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 3643.0, "completions/max_terminated_length": 3643.0, "completions/mean_length": 1808.0833740234375, "completions/mean_terminated_length": 1808.0833740234375, "completions/min_length": 686.0, "completions/min_terminated_length": 686.0, "epoch": 0.29918588873812757, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.566252587991718e-07, "loss": 0.0, "num_tokens": 29967603.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 882 }, { "completion_length": 1442.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1852.0, "completions/max_terminated_length": 1852.0, "completions/mean_length": 1442.0833740234375, "completions/mean_terminated_length": 1442.0833740234375, "completions/min_length": 820.0, "completions/min_terminated_length": 820.0, "epoch": 0.29952510176390773, "frac_reward_zero_std": 0.0, "grad_norm": 0.5057232975959778, "kl": 0.0, "learning_rate": 3.564527260179434e-07, "loss": 0.0043, "num_tokens": 30000094.0, "reward": 1.1041667461395264, "reward_std": 0.27090632915496826, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 883 }, { "completion_length": 1418.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2682.0, "completions/max_terminated_length": 2682.0, "completions/mean_length": 1418.3333740234375, "completions/mean_terminated_length": 1418.3333740234375, "completions/min_length": 675.0, "completions/min_terminated_length": 675.0, "epoch": 0.29986431478968795, "frac_reward_zero_std": 1.0, "grad_norm": 2.0655014054682397e-07, "kl": 0.0, "learning_rate": 3.562801932367149e-07, "loss": 0.0, "num_tokens": 30025184.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 884 }, { "completion_length": 926.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1558.0, "completions/max_terminated_length": 1558.0, "completions/mean_length": 926.25, "completions/mean_terminated_length": 926.25, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.3002035278154681, "frac_reward_zero_std": 0.5, "grad_norm": 0.08455709367990494, "kl": 0.0, "learning_rate": 3.5610766045548653e-07, "loss": 0.0011, "num_tokens": 30046109.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 885 }, { "completion_length": 983.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1365.0, "completions/max_terminated_length": 1365.0, "completions/mean_length": 983.0, "completions/mean_terminated_length": 983.0, "completions/min_length": 604.0, "completions/min_terminated_length": 604.0, "epoch": 0.3005427408412483, "frac_reward_zero_std": 1.0, "grad_norm": 2.2232033813907037e-07, "kl": 0.0, "learning_rate": 3.559351276742581e-07, "loss": 0.0, "num_tokens": 30072803.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 886 }, { "completion_length": 1092.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1479.0, "completions/max_terminated_length": 1479.0, "completions/mean_length": 1092.666748046875, "completions/mean_terminated_length": 1092.666748046875, "completions/min_length": 793.0, "completions/min_terminated_length": 793.0, "epoch": 0.3008819538670285, "frac_reward_zero_std": 0.0, "grad_norm": 0.1025518849492073, "kl": 0.0, "learning_rate": 3.557625948930297e-07, "loss": -0.0005, "num_tokens": 30095977.0, "reward": 1.1666667461395264, "reward_std": 0.09559705853462219, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 887 }, { "completion_length": 1043.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1950.0, "completions/max_terminated_length": 1950.0, "completions/mean_length": 1043.3333740234375, "completions/mean_terminated_length": 1043.3333740234375, "completions/min_length": 683.0, "completions/min_terminated_length": 683.0, "epoch": 0.3012211668928087, "frac_reward_zero_std": 1.0, "grad_norm": 1.7373888283600536e-07, "kl": 0.0, "learning_rate": 3.555900621118012e-07, "loss": 0.0, "num_tokens": 30121943.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 888 }, { "completion_length": 1952.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3501.0, "completions/max_terminated_length": 3501.0, "completions/mean_length": 1952.25, "completions/mean_terminated_length": 1952.25, "completions/min_length": 539.0, "completions/min_terminated_length": 539.0, "epoch": 0.30156037991858886, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.554175293305728e-07, "loss": 0.0, "num_tokens": 30153266.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 889 }, { "completion_length": 1630.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3602.0, "completions/max_terminated_length": 3602.0, "completions/mean_length": 1630.666748046875, "completions/mean_terminated_length": 1630.666748046875, "completions/min_length": 514.0, "completions/min_terminated_length": 514.0, "epoch": 0.3018995929443691, "frac_reward_zero_std": 0.5, "grad_norm": 0.09205930680036545, "kl": 0.0, "learning_rate": 3.5524499654934434e-07, "loss": -0.0002, "num_tokens": 30189412.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 890 }, { "completion_length": 1857.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3834.0, "completions/max_terminated_length": 3834.0, "completions/mean_length": 1857.916748046875, "completions/mean_terminated_length": 1857.916748046875, "completions/min_length": 929.0, "completions/min_terminated_length": 929.0, "epoch": 0.30223880597014924, "frac_reward_zero_std": 1.0, "grad_norm": 1.2736225585285865e-07, "kl": 0.0, "learning_rate": 3.5507246376811595e-07, "loss": 0.0, "num_tokens": 30221493.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 891 }, { "completion_length": 1551.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2618.0, "completions/max_terminated_length": 2618.0, "completions/mean_length": 1551.75, "completions/mean_terminated_length": 1551.75, "completions/min_length": 926.0, "completions/min_terminated_length": 926.0, "epoch": 0.30257801899592945, "frac_reward_zero_std": 0.5, "grad_norm": 0.07093626260757446, "kl": 0.0, "learning_rate": 3.5489993098688745e-07, "loss": 0.001, "num_tokens": 30250272.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 892 }, { "completion_length": 2763.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5376.0, "completions/max_terminated_length": 5376.0, "completions/mean_length": 2763.58349609375, "completions/mean_terminated_length": 2763.58349609375, "completions/min_length": 1006.0, "completions/min_terminated_length": 1006.0, "epoch": 0.3029172320217096, "frac_reward_zero_std": 0.0, "grad_norm": 0.8445454835891724, "kl": 0.0, "learning_rate": 3.5472739820565906e-07, "loss": 0.0196, "num_tokens": 30297445.0, "reward": 1.0333333015441895, "reward_std": 0.4581989049911499, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.44585633277893066, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 893 }, { "completion_length": 744.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1564.0, "completions/max_terminated_length": 1564.0, "completions/mean_length": 744.4166870117188, "completions/mean_terminated_length": 744.4166870117188, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 0.30325644504748983, "frac_reward_zero_std": 0.5, "grad_norm": 0.08562467247247696, "kl": 0.0, "learning_rate": 3.5455486542443066e-07, "loss": 0.0003, "num_tokens": 30324564.0, "reward": 1.2666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 894 }, { "completion_length": 1995.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4017.0, "completions/max_terminated_length": 4017.0, "completions/mean_length": 1995.916748046875, "completions/mean_terminated_length": 1995.916748046875, "completions/min_length": 585.0, "completions/min_terminated_length": 585.0, "epoch": 0.30359565807327, "frac_reward_zero_std": 0.5, "grad_norm": 0.11056003719568253, "kl": 0.0, "learning_rate": 3.543823326432022e-07, "loss": 0.0017, "num_tokens": 30357587.0, "reward": 0.75, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.45000001788139343, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 895 }, { "completion_length": 1346.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 6012.0, "completions/max_terminated_length": 6012.0, "completions/mean_length": 1346.416748046875, "completions/mean_terminated_length": 1346.416748046875, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 0.3039348710990502, "frac_reward_zero_std": 0.5, "grad_norm": 0.997124433517456, "kl": 0.0, "learning_rate": 3.5420979986197377e-07, "loss": 0.0458, "num_tokens": 30385786.0, "reward": 1.2000000476837158, "reward_std": 0.20000001788139343, "rewards/correctness_reward_func/mean": 0.9000000357627869, "rewards/correctness_reward_func/std": 0.28919950127601624, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 896 }, { "completion_length": 1527.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 5303.0, "completions/max_terminated_length": 5303.0, "completions/mean_length": 1527.75, "completions/mean_terminated_length": 1527.75, "completions/min_length": 634.0, "completions/min_terminated_length": 634.0, "epoch": 0.30427408412483037, "frac_reward_zero_std": 0.5, "grad_norm": 0.4941687285900116, "kl": 0.0, "learning_rate": 3.540372670807453e-07, "loss": 0.0035, "num_tokens": 30416203.0, "reward": 0.7166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.5149286985397339, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 897 }, { "completion_length": 1980.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6500.0, "completions/max_terminated_length": 6500.0, "completions/mean_length": 1980.0, "completions/mean_terminated_length": 1980.0, "completions/min_length": 618.0, "completions/min_terminated_length": 618.0, "epoch": 0.3046132971506106, "frac_reward_zero_std": 0.5, "grad_norm": 0.8511844873428345, "kl": 0.0, "learning_rate": 3.538647342995169e-07, "loss": 0.0584, "num_tokens": 30449485.0, "reward": 0.7166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.5149286985397339, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 898 }, { "completion_length": 921.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1993.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 921.75, "completions/mean_terminated_length": 921.75, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 0.3049525101763908, "frac_reward_zero_std": 1.0, "grad_norm": 2.2145842137888394e-07, "kl": 0.0, "learning_rate": 3.536922015182884e-07, "loss": 0.0, "num_tokens": 30472606.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 899 }, { "completion_length": 2026.166748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5969.0, "completions/mean_length": 2575.25, "completions/mean_terminated_length": 2210.36376953125, "completions/min_length": 546.0, "completions/min_terminated_length": 546.0, "epoch": 0.30529172320217096, "frac_reward_zero_std": 0.5, "grad_norm": 0.5869353413581848, "kl": NaN, "learning_rate": 3.5351966873706003e-07, "loss": -0.0101, "num_tokens": 30508272.0, "reward": 0.9916666746139526, "reward_std": 0.2727941870689392, "rewards/correctness_reward_func/mean": 0.7166666984558105, "rewards/correctness_reward_func/std": 0.4386618733406067, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 900 }, { "completion_length": 2621.33349609375, "completions/clipped_ratio": 0.0, "completions/max_length": 4830.0, "completions/max_terminated_length": 4830.0, "completions/mean_length": 2621.33349609375, "completions/mean_terminated_length": 2621.33349609375, "completions/min_length": 616.0, "completions/min_terminated_length": 616.0, "epoch": 0.3056309362279512, "frac_reward_zero_std": 1.0, "grad_norm": 2.387363906564133e-07, "kl": 0.0, "learning_rate": 3.533471359558316e-07, "loss": 0.0, "num_tokens": 30554278.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 901 }, { "completion_length": 1101.6666870117188, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5642.0, "completions/mean_length": 3298.0, "completions/mean_terminated_length": 1652.5, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 0.30597014925373134, "frac_reward_zero_std": 0.0, "grad_norm": 0.21892796456813812, "kl": NaN, "learning_rate": 3.531746031746032e-07, "loss": -0.0191, "num_tokens": 30574698.0, "reward": 0.7083333730697632, "reward_std": 0.10790684819221497, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.11965861916542053, "step": 902 }, { "completion_length": 1977.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3584.0, "completions/max_terminated_length": 3584.0, "completions/mean_length": 1977.3333740234375, "completions/mean_terminated_length": 1977.3333740234375, "completions/min_length": 894.0, "completions/min_terminated_length": 894.0, "epoch": 0.30630936227951155, "frac_reward_zero_std": 0.0, "grad_norm": 0.15072372555732727, "kl": 0.0, "learning_rate": 3.530020703933747e-07, "loss": -0.0001, "num_tokens": 30611248.0, "reward": 1.2000000476837158, "reward_std": 0.10327951610088348, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 903 }, { "completion_length": 1630.7501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 3331.0, "completions/max_terminated_length": 3331.0, "completions/mean_length": 1630.75, "completions/mean_terminated_length": 1630.75, "completions/min_length": 589.0, "completions/min_terminated_length": 589.0, "epoch": 0.3066485753052917, "frac_reward_zero_std": 0.5, "grad_norm": 0.10795415192842484, "kl": 0.0, "learning_rate": 3.528295376121463e-07, "loss": 0.0029, "num_tokens": 30643285.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 904 }, { "completion_length": 1505.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2986.0, "completions/max_terminated_length": 2986.0, "completions/mean_length": 1505.25, "completions/mean_terminated_length": 1505.25, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "epoch": 0.30698778833107193, "frac_reward_zero_std": 0.5, "grad_norm": 0.32887357473373413, "kl": 0.0, "learning_rate": 3.5265700483091785e-07, "loss": 0.0141, "num_tokens": 30672184.0, "reward": 0.7666667699813843, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.46666669845581055, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 905 }, { "completion_length": 1593.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3868.0, "completions/max_terminated_length": 3868.0, "completions/mean_length": 1593.0833740234375, "completions/mean_terminated_length": 1593.0833740234375, "completions/min_length": 582.0, "completions/min_terminated_length": 582.0, "epoch": 0.3073270013568521, "frac_reward_zero_std": 0.5, "grad_norm": 0.7148328423500061, "kl": 0.0, "learning_rate": 3.5248447204968945e-07, "loss": -0.0257, "num_tokens": 30698309.0, "reward": 0.7833334803581238, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.4833333194255829, "rewards/correctness_reward_func/std": 0.43029236793518066, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 906 }, { "completion_length": 1990.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4104.0, "completions/max_terminated_length": 4104.0, "completions/mean_length": 1990.8333740234375, "completions/mean_terminated_length": 1990.8333740234375, "completions/min_length": 985.0, "completions/min_terminated_length": 985.0, "epoch": 0.3076662143826323, "frac_reward_zero_std": 0.5, "grad_norm": 0.0934876799583435, "kl": 0.0, "learning_rate": 3.5231193926846095e-07, "loss": -0.0055, "num_tokens": 30735015.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 907 }, { "completion_length": 1272.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3085.0, "completions/max_terminated_length": 3085.0, "completions/mean_length": 1272.166748046875, "completions/mean_terminated_length": 1272.166748046875, "completions/min_length": 672.0, "completions/min_terminated_length": 672.0, "epoch": 0.30800542740841247, "frac_reward_zero_std": 0.5, "grad_norm": 0.0728154107928276, "kl": 0.0, "learning_rate": 3.5213940648723256e-07, "loss": -0.0013, "num_tokens": 30765851.0, "reward": 1.2666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 908 }, { "completion_length": 1949.5834350585938, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5665.0, "completions/mean_length": 2498.666748046875, "completions/mean_terminated_length": 2126.818359375, "completions/min_length": 812.0, "completions/min_terminated_length": 812.0, "epoch": 0.3083446404341927, "frac_reward_zero_std": 0.5, "grad_norm": 0.2722416818141937, "kl": NaN, "learning_rate": 3.5196687370600417e-07, "loss": -0.0216, "num_tokens": 30799218.0, "reward": 1.008333444595337, "reward_std": 0.22453658282756805, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 909 }, { "completion_length": 2033.3333740234375, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6482.0, "completions/mean_length": 3131.5, "completions/mean_terminated_length": 2440.0, "completions/min_length": 1208.0, "completions/min_terminated_length": 1208.0, "epoch": 0.30868385345997285, "frac_reward_zero_std": 0.5, "grad_norm": 0.22964391112327576, "kl": NaN, "learning_rate": 3.5179434092477567e-07, "loss": -0.0203, "num_tokens": 30830872.0, "reward": 0.25, "reward_std": 0.0774596780538559, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 910 }, { "completion_length": 1930.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3964.0, "completions/max_terminated_length": 3964.0, "completions/mean_length": 1930.0833740234375, "completions/mean_terminated_length": 1930.0833740234375, "completions/min_length": 728.0, "completions/min_terminated_length": 728.0, "epoch": 0.30902306648575306, "frac_reward_zero_std": 0.5, "grad_norm": 0.0788058415055275, "kl": 0.0, "learning_rate": 3.5162180814354727e-07, "loss": 0.0019, "num_tokens": 30866993.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 911 }, { "completion_length": 1843.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 5078.0, "completions/max_terminated_length": 5078.0, "completions/mean_length": 1843.0, "completions/mean_terminated_length": 1843.0, "completions/min_length": 720.0, "completions/min_terminated_length": 720.0, "epoch": 0.3093622795115332, "frac_reward_zero_std": 0.0, "grad_norm": 0.17806807160377502, "kl": 0.0, "learning_rate": 3.514492753623188e-07, "loss": -0.0043, "num_tokens": 30903971.0, "reward": 1.2166666984558105, "reward_std": 0.10641197860240936, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 912 }, { "completion_length": 1276.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1647.0, "completions/max_terminated_length": 1647.0, "completions/mean_length": 1276.75, "completions/mean_terminated_length": 1276.75, "completions/min_length": 797.0, "completions/min_terminated_length": 797.0, "epoch": 0.30970149253731344, "frac_reward_zero_std": 1.0, "grad_norm": 1.0859728405421265e-07, "kl": 0.0, "learning_rate": 3.5127674258109043e-07, "loss": 0.0, "num_tokens": 30936494.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 913 }, { "completion_length": 1329.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2402.0, "completions/max_terminated_length": 2402.0, "completions/mean_length": 1329.75, "completions/mean_terminated_length": 1329.75, "completions/min_length": 508.0, "completions/min_terminated_length": 508.0, "epoch": 0.3100407055630936, "frac_reward_zero_std": 1.0, "grad_norm": 2.0788388610526454e-07, "kl": 0.0, "learning_rate": 3.5110420979986193e-07, "loss": 0.0, "num_tokens": 30962819.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 914 }, { "completion_length": 941.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1229.0, "completions/max_terminated_length": 1229.0, "completions/mean_length": 941.0, "completions/mean_terminated_length": 941.0, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.3103799185888738, "frac_reward_zero_std": 1.0, "grad_norm": 9.0740662983535e-08, "kl": 0.0, "learning_rate": 3.5093167701863354e-07, "loss": 0.0, "num_tokens": 30982991.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 915 }, { "completion_length": 1847.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2928.0, "completions/max_terminated_length": 2928.0, "completions/mean_length": 1847.5, "completions/mean_terminated_length": 1847.5, "completions/min_length": 1272.0, "completions/min_terminated_length": 1272.0, "epoch": 0.310719131614654, "frac_reward_zero_std": 0.0, "grad_norm": 0.5373735427856445, "kl": 0.0, "learning_rate": 3.507591442374051e-07, "loss": 0.0057, "num_tokens": 31018577.0, "reward": 0.44999998807907104, "reward_std": 0.36742347478866577, "rewards/correctness_reward_func/mean": 0.14999999105930328, "rewards/correctness_reward_func/std": 0.35290998220443726, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 916 }, { "completion_length": 1876.0000610351562, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5651.0, "completions/mean_length": 2974.166748046875, "completions/mean_terminated_length": 2251.199951171875, "completions/min_length": 555.0, "completions/min_terminated_length": 555.0, "epoch": 0.3110583446404342, "frac_reward_zero_std": 0.0, "grad_norm": 0.8309308290481567, "kl": NaN, "learning_rate": 3.505866114561767e-07, "loss": -0.0607, "num_tokens": 31055465.0, "reward": 0.5833333730697632, "reward_std": 0.4858439564704895, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 917 }, { "completion_length": 1174.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2559.0, "completions/max_terminated_length": 2559.0, "completions/mean_length": 1174.916748046875, "completions/mean_terminated_length": 1174.916748046875, "completions/min_length": 688.0, "completions/min_terminated_length": 688.0, "epoch": 0.3113975576662144, "frac_reward_zero_std": 0.5, "grad_norm": 0.10468626022338867, "kl": 0.0, "learning_rate": 3.504140786749482e-07, "loss": -0.0008, "num_tokens": 31078678.0, "reward": 1.1666667461395264, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 918 }, { "completion_length": 1952.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 5511.0, "completions/max_terminated_length": 5511.0, "completions/mean_length": 1952.416748046875, "completions/mean_terminated_length": 1952.416748046875, "completions/min_length": 861.0, "completions/min_terminated_length": 861.0, "epoch": 0.31173677069199457, "frac_reward_zero_std": 0.5, "grad_norm": 0.5677090287208557, "kl": 0.0, "learning_rate": 3.502415458937198e-07, "loss": -0.0326, "num_tokens": 31110243.0, "reward": 0.5166666507720947, "reward_std": 0.24013885855674744, "rewards/correctness_reward_func/mean": 0.21666665375232697, "rewards/correctness_reward_func/std": 0.39504507184028625, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 919 }, { "completion_length": 813.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1229.0, "completions/max_terminated_length": 1229.0, "completions/mean_length": 813.5, "completions/mean_terminated_length": 813.5, "completions/min_length": 620.0, "completions/min_terminated_length": 620.0, "epoch": 0.3120759837177748, "frac_reward_zero_std": 0.5, "grad_norm": 0.06546156108379364, "kl": 0.0, "learning_rate": 3.5006901311249135e-07, "loss": 0.001, "num_tokens": 31134165.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 920 }, { "completion_length": 829.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2224.0, "completions/max_terminated_length": 2224.0, "completions/mean_length": 829.5, "completions/mean_terminated_length": 829.5, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 0.31241519674355495, "frac_reward_zero_std": 0.5, "grad_norm": 0.0700530856847763, "kl": 0.0, "learning_rate": 3.498964803312629e-07, "loss": -0.003, "num_tokens": 31155189.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 921 }, { "completion_length": 1294.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2378.0, "completions/max_terminated_length": 2378.0, "completions/mean_length": 1294.166748046875, "completions/mean_terminated_length": 1294.166748046875, "completions/min_length": 536.0, "completions/min_terminated_length": 536.0, "epoch": 0.31275440976933516, "frac_reward_zero_std": 0.0, "grad_norm": 0.1272267997264862, "kl": 0.0, "learning_rate": 3.4972394755003446e-07, "loss": -0.002, "num_tokens": 31176545.0, "reward": 1.2000000476837158, "reward_std": 0.08164963126182556, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 922 }, { "completion_length": 1220.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1935.0, "completions/max_terminated_length": 1935.0, "completions/mean_length": 1220.916748046875, "completions/mean_terminated_length": 1220.916748046875, "completions/min_length": 633.0, "completions/min_terminated_length": 633.0, "epoch": 0.3130936227951153, "frac_reward_zero_std": 0.5, "grad_norm": 0.055474903434515, "kl": 0.0, "learning_rate": 3.4955141476880606e-07, "loss": 0.0003, "num_tokens": 31205242.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 923 }, { "completion_length": 1090.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2956.0, "completions/max_terminated_length": 2956.0, "completions/mean_length": 1090.0833740234375, "completions/mean_terminated_length": 1090.0833740234375, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.31343283582089554, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.4937888198757767e-07, "loss": 0.0, "num_tokens": 31232585.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 924 }, { "completion_length": 1138.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4259.0, "completions/max_terminated_length": 4259.0, "completions/mean_length": 1138.3333740234375, "completions/mean_terminated_length": 1138.3333740234375, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.3137720488466757, "frac_reward_zero_std": 0.5, "grad_norm": 0.37474891543388367, "kl": 0.0, "learning_rate": 3.4920634920634917e-07, "loss": -0.0011, "num_tokens": 31258053.0, "reward": 1.1166667938232422, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 925 }, { "completion_length": 977.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2604.0, "completions/max_terminated_length": 2604.0, "completions/mean_length": 977.25, "completions/mean_terminated_length": 977.25, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 0.3141112618724559, "frac_reward_zero_std": 1.0, "grad_norm": 9.163004222045856e-08, "kl": 0.0, "learning_rate": 3.490338164251208e-07, "loss": 0.0, "num_tokens": 31285440.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 926 }, { "completion_length": 1147.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1901.0, "completions/max_terminated_length": 1901.0, "completions/mean_length": 1147.0, "completions/mean_terminated_length": 1147.0, "completions/min_length": 658.0, "completions/min_terminated_length": 658.0, "epoch": 0.3144504748982361, "frac_reward_zero_std": 1.0, "grad_norm": 1.1602305249880374e-07, "kl": 0.0, "learning_rate": 3.4886128364389233e-07, "loss": 0.0, "num_tokens": 31308624.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 927 }, { "completion_length": 745.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1572.0, "completions/max_terminated_length": 1572.0, "completions/mean_length": 745.0833740234375, "completions/mean_terminated_length": 745.0833740234375, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 0.3147896879240163, "frac_reward_zero_std": 1.0, "grad_norm": 1.0053457799585885e-07, "kl": 0.0, "learning_rate": 3.4868875086266393e-07, "loss": 0.0, "num_tokens": 31327321.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 928 }, { "completion_length": 883.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1468.0, "completions/max_terminated_length": 1468.0, "completions/mean_length": 883.0, "completions/mean_terminated_length": 883.0, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 0.31512890094979645, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.4851621808143543e-07, "loss": 0.0, "num_tokens": 31351351.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 929 }, { "completion_length": 1326.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3650.0, "completions/max_terminated_length": 3650.0, "completions/mean_length": 1326.5833740234375, "completions/mean_terminated_length": 1326.5833740234375, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 0.31546811397557667, "frac_reward_zero_std": 0.5, "grad_norm": 0.2548946440219879, "kl": 0.0, "learning_rate": 3.4834368530020704e-07, "loss": 0.0018, "num_tokens": 31381022.0, "reward": 1.0500000715255737, "reward_std": 0.17606817185878754, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.24308621883392334, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 930 }, { "completion_length": 4050.166748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6140.0, "completions/mean_length": 4599.25, "completions/mean_terminated_length": 4418.36376953125, "completions/min_length": 1481.0, "completions/min_terminated_length": 1481.0, "epoch": 0.31580732700135683, "frac_reward_zero_std": 0.5, "grad_norm": 0.3112356662750244, "kl": NaN, "learning_rate": 3.481711525189786e-07, "loss": -0.0497, "num_tokens": 31438420.0, "reward": 0.6791666746139526, "reward_std": 0.2609677314758301, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.5149286985397339, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 931 }, { "completion_length": 1440.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2638.0, "completions/max_terminated_length": 2638.0, "completions/mean_length": 1440.416748046875, "completions/mean_terminated_length": 1440.416748046875, "completions/min_length": 912.0, "completions/min_terminated_length": 912.0, "epoch": 0.31614654002713705, "frac_reward_zero_std": 0.5, "grad_norm": 0.07152718305587769, "kl": 0.0, "learning_rate": 3.4799861973775015e-07, "loss": -0.0004, "num_tokens": 31466607.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 932 }, { "completion_length": 913.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1962.0, "completions/max_terminated_length": 1962.0, "completions/mean_length": 913.1666870117188, "completions/mean_terminated_length": 913.1666870117188, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.3164857530529172, "frac_reward_zero_std": 0.5, "grad_norm": 0.03750376030802727, "kl": 0.0, "learning_rate": 3.478260869565217e-07, "loss": 0.0, "num_tokens": 31491683.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 933 }, { "completion_length": 615.5833587646484, "completions/clipped_ratio": 0.5, "completions/max_length": 6589.0, "completions/max_terminated_length": 1752.0, "completions/mean_length": 3910.08349609375, "completions/mean_terminated_length": 1231.166748046875, "completions/min_length": 780.0, "completions/min_terminated_length": 780.0, "epoch": 0.3168249660786974, "frac_reward_zero_std": 0.5, "grad_norm": 0.052773211151361465, "kl": NaN, "learning_rate": 3.476535541752933e-07, "loss": -0.0011, "num_tokens": 31510014.0, "reward": 0.6333333253860474, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.15000000596046448, "rewards/format_reward_func/std": 0.15666989982128143, "step": 934 }, { "completion_length": 1668.75, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 5946.0, "completions/mean_length": 3316.0, "completions/mean_terminated_length": 2225.0, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 0.31716417910447764, "frac_reward_zero_std": 0.0, "grad_norm": 0.8342804312705994, "kl": NaN, "learning_rate": 3.4748102139406486e-07, "loss": -0.1025, "num_tokens": 31543809.0, "reward": 0.845833420753479, "reward_std": 0.5563273429870605, "rewards/correctness_reward_func/mean": 0.6333333253860474, "rewards/correctness_reward_func/std": 0.4735424220561981, "rewards/format_reward_func/mean": 0.21250002086162567, "rewards/format_reward_func/std": 0.13505050539970398, "step": 935 }, { "completion_length": 444.75, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 444.75, "completions/mean_terminated_length": 444.75, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.3175033921302578, "frac_reward_zero_std": 0.5, "grad_norm": 0.06688177585601807, "kl": 0.0, "learning_rate": 3.473084886128364e-07, "loss": -0.0, "num_tokens": 31563336.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 936 }, { "completion_length": 1351.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2880.0, "completions/max_terminated_length": 2880.0, "completions/mean_length": 1351.416748046875, "completions/mean_terminated_length": 1351.416748046875, "completions/min_length": 847.0, "completions/min_terminated_length": 847.0, "epoch": 0.317842605156038, "frac_reward_zero_std": 0.5, "grad_norm": 0.477865070104599, "kl": 0.0, "learning_rate": 3.4713595583160796e-07, "loss": 0.0164, "num_tokens": 31591571.0, "reward": 1.2166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 937 }, { "completion_length": 2661.166748046875, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 5988.0, "completions/mean_length": 4308.4169921875, "completions/mean_terminated_length": 3548.22216796875, "completions/min_length": 1966.0, "completions/min_terminated_length": 1966.0, "epoch": 0.3181818181818182, "frac_reward_zero_std": 0.5, "grad_norm": 1.4337899684906006, "kl": NaN, "learning_rate": 3.4696342305037957e-07, "loss": -0.0686, "num_tokens": 31636261.0, "reward": 0.7083333730697632, "reward_std": 0.2518266439437866, "rewards/correctness_reward_func/mean": 0.4833333194255829, "rewards/correctness_reward_func/std": 0.43029236793518066, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 938 }, { "completion_length": 688.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1200.0, "completions/max_terminated_length": 1200.0, "completions/mean_length": 688.75, "completions/mean_terminated_length": 688.75, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 0.3185210312075984, "frac_reward_zero_std": 0.0, "grad_norm": 0.09581376612186432, "kl": 0.0, "learning_rate": 3.467908902691512e-07, "loss": -0.0006, "num_tokens": 31659796.0, "reward": 1.2166666984558105, "reward_std": 0.09246456623077393, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 939 }, { "completion_length": 1968.166748046875, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 5840.0, "completions/mean_length": 3615.416748046875, "completions/mean_terminated_length": 2624.22216796875, "completions/min_length": 1067.0, "completions/min_terminated_length": 1067.0, "epoch": 0.31886024423337855, "frac_reward_zero_std": 0.0, "grad_norm": 0.884489893913269, "kl": NaN, "learning_rate": 3.466183574879227e-07, "loss": -0.1014, "num_tokens": 31698522.0, "reward": 0.5916666984558105, "reward_std": 0.5406736135482788, "rewards/correctness_reward_func/mean": 0.36666667461395264, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.2250000238418579, "rewards/format_reward_func/std": 0.13568010926246643, "step": 940 }, { "completion_length": 1261.1666870117188, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 2849.0, "completions/mean_length": 1810.25, "completions/mean_terminated_length": 1375.8182373046875, "completions/min_length": 785.0, "completions/min_terminated_length": 785.0, "epoch": 0.31919945725915877, "frac_reward_zero_std": 0.5, "grad_norm": 0.24075216054916382, "kl": NaN, "learning_rate": 3.464458247066943e-07, "loss": -0.0164, "num_tokens": 31727606.0, "reward": 1.1083333492279053, "reward_std": 0.22453658282756805, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.2806917726993561, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 941 }, { "completion_length": 985.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1892.0, "completions/max_terminated_length": 1892.0, "completions/mean_length": 985.25, "completions/mean_terminated_length": 985.25, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 0.31953867028493893, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.4627329192546583e-07, "loss": 0.0, "num_tokens": 31750769.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 942 }, { "completion_length": 887.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1810.0, "completions/max_terminated_length": 1810.0, "completions/mean_length": 887.9166870117188, "completions/mean_terminated_length": 887.9166870117188, "completions/min_length": 538.0, "completions/min_terminated_length": 538.0, "epoch": 0.31987788331071915, "frac_reward_zero_std": 1.0, "grad_norm": 2.1579033671059733e-07, "kl": 0.0, "learning_rate": 3.461007591442374e-07, "loss": 0.0, "num_tokens": 31775662.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 943 }, { "completion_length": 1304.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3547.0, "completions/max_terminated_length": 3547.0, "completions/mean_length": 1304.5, "completions/mean_terminated_length": 1304.5, "completions/min_length": 528.0, "completions/min_terminated_length": 528.0, "epoch": 0.3202170963364993, "frac_reward_zero_std": 1.0, "grad_norm": 2.3239888946591236e-07, "kl": 0.0, "learning_rate": 3.4592822636300894e-07, "loss": 0.0, "num_tokens": 31795600.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 944 }, { "completion_length": 697.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1262.0, "completions/max_terminated_length": 1262.0, "completions/mean_length": 697.75, "completions/mean_terminated_length": 697.75, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 0.3205563093622795, "frac_reward_zero_std": 0.5, "grad_norm": 0.2733204960823059, "kl": 0.0, "learning_rate": 3.4575569358178054e-07, "loss": -0.0005, "num_tokens": 31818679.0, "reward": 1.2000000476837158, "reward_std": 0.19999998807907104, "rewards/correctness_reward_func/mean": 0.9000000357627869, "rewards/correctness_reward_func/std": 0.28919950127601624, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 945 }, { "completion_length": 2257.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6290.0, "completions/max_terminated_length": 6290.0, "completions/mean_length": 2257.416748046875, "completions/mean_terminated_length": 2257.416748046875, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.3208955223880597, "frac_reward_zero_std": 0.0, "grad_norm": 0.6494872570037842, "kl": 0.0, "learning_rate": 3.455831608005521e-07, "loss": -0.0207, "num_tokens": 31862034.0, "reward": 0.7833333015441895, "reward_std": 0.36742347478866577, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 946 }, { "completion_length": 1796.0000915527344, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4971.0, "completions/mean_length": 2345.08349609375, "completions/mean_terminated_length": 1959.2728271484375, "completions/min_length": 595.0, "completions/min_terminated_length": 595.0, "epoch": 0.3212347354138399, "frac_reward_zero_std": 0.0, "grad_norm": 0.12197453528642654, "kl": NaN, "learning_rate": 3.4541062801932365e-07, "loss": -0.0085, "num_tokens": 31895976.0, "reward": 0.7458333373069763, "reward_std": 0.10710843652486801, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 947 }, { "completion_length": 608.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1151.0, "completions/max_terminated_length": 1151.0, "completions/mean_length": 608.1666870117188, "completions/mean_terminated_length": 608.1666870117188, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "epoch": 0.32157394843962006, "frac_reward_zero_std": 1.0, "grad_norm": 1.0008302098185595e-07, "kl": 0.0, "learning_rate": 3.452380952380952e-07, "loss": 0.0, "num_tokens": 31916744.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 948 }, { "completion_length": 878.2500610351562, "completions/clipped_ratio": 0.41666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 4383.0, "completions/mean_length": 3623.666748046875, "completions/mean_terminated_length": 1505.571533203125, "completions/min_length": 724.0, "completions/min_terminated_length": 724.0, "epoch": 0.3219131614654003, "frac_reward_zero_std": 0.5, "grad_norm": 0.11442892253398895, "kl": NaN, "learning_rate": 3.450655624568668e-07, "loss": -0.0083, "num_tokens": 31940219.0, "reward": 0.7250000238418579, "reward_std": 0.06708204001188278, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.2250000238418579, "rewards/format_reward_func/std": 0.11965861171483994, "step": 949 }, { "completion_length": 1511.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3411.0, "completions/max_terminated_length": 3411.0, "completions/mean_length": 1511.666748046875, "completions/mean_terminated_length": 1511.666748046875, "completions/min_length": 705.0, "completions/min_terminated_length": 705.0, "epoch": 0.32225237449118044, "frac_reward_zero_std": 1.0, "grad_norm": 1.4038799633908638e-07, "kl": 0.0, "learning_rate": 3.4489302967563836e-07, "loss": 0.0, "num_tokens": 31968571.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 950 }, { "completion_length": 1021.1667175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 2335.0, "completions/max_terminated_length": 2335.0, "completions/mean_length": 1021.1666870117188, "completions/mean_terminated_length": 1021.1666870117188, "completions/min_length": 677.0, "completions/min_terminated_length": 677.0, "epoch": 0.32259158751696065, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.447204968944099e-07, "loss": 0.0, "num_tokens": 31991547.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 951 }, { "completion_length": 739.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 739.6666870117188, "completions/mean_terminated_length": 739.6666870117188, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 0.3229308005427408, "frac_reward_zero_std": 1.0, "grad_norm": 1.2268769467027596e-07, "kl": 0.0, "learning_rate": 3.4454796411318147e-07, "loss": 0.0, "num_tokens": 32012159.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 952 }, { "completion_length": 1022.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2877.0, "completions/max_terminated_length": 2877.0, "completions/mean_length": 1022.5, "completions/mean_terminated_length": 1022.5, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 0.32327001356852103, "frac_reward_zero_std": 0.5, "grad_norm": 0.05467428267002106, "kl": 0.0, "learning_rate": 3.4437543133195307e-07, "loss": -0.0002, "num_tokens": 32035457.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 953 }, { "completion_length": 2420.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 5646.0, "completions/max_terminated_length": 5646.0, "completions/mean_length": 2420.416748046875, "completions/mean_terminated_length": 2420.416748046875, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 0.32360922659430125, "frac_reward_zero_std": 0.5, "grad_norm": 0.4149426221847534, "kl": 0.0, "learning_rate": 3.4420289855072457e-07, "loss": -0.0127, "num_tokens": 32074594.0, "reward": 0.7541667819023132, "reward_std": 0.13268069922924042, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 954 }, { "completion_length": 832.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1620.0, "completions/max_terminated_length": 1620.0, "completions/mean_length": 832.0833740234375, "completions/mean_terminated_length": 832.0833740234375, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "epoch": 0.3239484396200814, "frac_reward_zero_std": 0.5, "grad_norm": 0.08572785556316376, "kl": 0.0, "learning_rate": 3.440303657694962e-07, "loss": 0.0007, "num_tokens": 32092451.0, "reward": 1.2666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 955 }, { "completion_length": 1114.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1610.0, "completions/max_terminated_length": 1610.0, "completions/mean_length": 1114.416748046875, "completions/mean_terminated_length": 1114.416748046875, "completions/min_length": 749.0, "completions/min_terminated_length": 749.0, "epoch": 0.3242876526458616, "frac_reward_zero_std": 1.0, "grad_norm": 2.0851454962667049e-07, "kl": 0.0, "learning_rate": 3.438578329882678e-07, "loss": 0.0, "num_tokens": 32116522.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 956 }, { "completion_length": 2551.08349609375, "completions/clipped_ratio": 0.0, "completions/max_length": 3836.0, "completions/max_terminated_length": 3836.0, "completions/mean_length": 2551.08349609375, "completions/mean_terminated_length": 2551.08349609375, "completions/min_length": 1350.0, "completions/min_terminated_length": 1350.0, "epoch": 0.3246268656716418, "frac_reward_zero_std": 0.0, "grad_norm": 0.6476059556007385, "kl": 0.0, "learning_rate": 3.4368530020703934e-07, "loss": -0.0199, "num_tokens": 32160011.0, "reward": 0.7666666507720947, "reward_std": 0.36329931020736694, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 957 }, { "completion_length": 2170.5834350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 4740.0, "completions/max_terminated_length": 4740.0, "completions/mean_length": 2170.58349609375, "completions/mean_terminated_length": 2170.58349609375, "completions/min_length": 642.0, "completions/min_terminated_length": 642.0, "epoch": 0.324966078697422, "frac_reward_zero_std": 0.5, "grad_norm": 0.45245659351348877, "kl": 0.0, "learning_rate": 3.435127674258109e-07, "loss": -0.0042, "num_tokens": 32202528.0, "reward": 0.7666667699813843, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 958 }, { "completion_length": 1702.166748046875, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4745.0, "completions/mean_length": 3898.5, "completions/mean_terminated_length": 2553.25, "completions/min_length": 811.0, "completions/min_terminated_length": 811.0, "epoch": 0.32530529172320216, "frac_reward_zero_std": 0.0, "grad_norm": 0.7316205501556396, "kl": NaN, "learning_rate": 3.4334023464458244e-07, "loss": -0.0585, "num_tokens": 32236376.0, "reward": 0.8000000715255737, "reward_std": 0.33565855026245117, "rewards/correctness_reward_func/mean": 0.5999999642372131, "rewards/correctness_reward_func/std": 0.45126086473464966, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.14770980179309845, "step": 959 }, { "completion_length": 1313.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3487.0, "completions/max_terminated_length": 3487.0, "completions/mean_length": 1313.0833740234375, "completions/mean_terminated_length": 1313.0833740234375, "completions/min_length": 599.0, "completions/min_terminated_length": 599.0, "epoch": 0.3256445047489824, "frac_reward_zero_std": 0.0, "grad_norm": 0.10751835256814957, "kl": 0.0, "learning_rate": 3.4316770186335405e-07, "loss": 0.001, "num_tokens": 32263551.0, "reward": 1.183333396911621, "reward_std": 0.09246458858251572, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 960 }, { "completion_length": 672.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 672.5, "completions/mean_terminated_length": 672.5, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 0.32598371777476254, "frac_reward_zero_std": 1.0, "grad_norm": 1.0762104096784242e-07, "kl": 0.0, "learning_rate": 3.429951690821256e-07, "loss": 0.0, "num_tokens": 32283729.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 961 }, { "completion_length": 1622.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2902.0, "completions/max_terminated_length": 2902.0, "completions/mean_length": 1622.75, "completions/mean_terminated_length": 1622.75, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "epoch": 0.32632293080054275, "frac_reward_zero_std": 0.5, "grad_norm": 0.08299823850393295, "kl": 0.0, "learning_rate": 3.4282263630089715e-07, "loss": -0.0008, "num_tokens": 32315124.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.09045340120792389, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 962 }, { "completion_length": 2098.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3588.0, "completions/max_terminated_length": 3588.0, "completions/mean_length": 2098.08349609375, "completions/mean_terminated_length": 2098.08349609375, "completions/min_length": 842.0, "completions/min_terminated_length": 842.0, "epoch": 0.3266621438263229, "frac_reward_zero_std": 0.0, "grad_norm": 0.5058161616325378, "kl": 0.0, "learning_rate": 3.426501035196687e-07, "loss": -0.0014, "num_tokens": 32354557.0, "reward": 0.8500000834465027, "reward_std": 0.2557638883590698, "rewards/correctness_reward_func/mean": 0.550000011920929, "rewards/correctness_reward_func/std": 0.49082493782043457, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 963 }, { "completion_length": 1694.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3257.0, "completions/max_terminated_length": 3257.0, "completions/mean_length": 1694.75, "completions/mean_terminated_length": 1694.75, "completions/min_length": 557.0, "completions/min_terminated_length": 557.0, "epoch": 0.32700135685210313, "frac_reward_zero_std": 0.5, "grad_norm": 0.06396138668060303, "kl": 0.0, "learning_rate": 3.424775707384403e-07, "loss": 0.0003, "num_tokens": 32387842.0, "reward": 0.7666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 964 }, { "completion_length": 1904.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 6139.0, "completions/max_terminated_length": 6139.0, "completions/mean_length": 1904.8333740234375, "completions/mean_terminated_length": 1904.8333740234375, "completions/min_length": 552.0, "completions/min_terminated_length": 552.0, "epoch": 0.3273405698778833, "frac_reward_zero_std": 0.5, "grad_norm": 0.1909196823835373, "kl": 0.0, "learning_rate": 3.423050379572118e-07, "loss": 0.0087, "num_tokens": 32422652.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 965 }, { "completion_length": 1142.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2871.0, "completions/max_terminated_length": 2871.0, "completions/mean_length": 1142.416748046875, "completions/mean_terminated_length": 1142.416748046875, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.3276797829036635, "frac_reward_zero_std": 1.0, "grad_norm": 3.0000242645655817e-07, "kl": 0.0, "learning_rate": 3.421325051759834e-07, "loss": 0.0, "num_tokens": 32446513.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 966 }, { "completion_length": 2232.5834350585938, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5776.0, "completions/mean_length": 2781.666748046875, "completions/mean_terminated_length": 2435.54541015625, "completions/min_length": 844.0, "completions/min_terminated_length": 844.0, "epoch": 0.32801899592944367, "frac_reward_zero_std": 0.5, "grad_norm": 0.3459160327911377, "kl": NaN, "learning_rate": 3.4195997239475497e-07, "loss": -0.0268, "num_tokens": 32487488.0, "reward": 1.1541666984558105, "reward_std": 0.21588000655174255, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.287096232175827, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 967 }, { "completion_length": 1698.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4551.0, "completions/max_terminated_length": 4551.0, "completions/mean_length": 1698.416748046875, "completions/mean_terminated_length": 1698.416748046875, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "epoch": 0.3283582089552239, "frac_reward_zero_std": 0.0, "grad_norm": 0.6486458778381348, "kl": 0.0, "learning_rate": 3.417874396135266e-07, "loss": 0.0035, "num_tokens": 32516797.0, "reward": 1.0500000715255737, "reward_std": 0.299967497587204, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.36306774616241455, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 968 }, { "completion_length": 1960.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4384.0, "completions/max_terminated_length": 4384.0, "completions/mean_length": 1960.3333740234375, "completions/mean_terminated_length": 1960.3333740234375, "completions/min_length": 522.0, "completions/min_terminated_length": 522.0, "epoch": 0.32869742198100405, "frac_reward_zero_std": 0.5, "grad_norm": 0.4522709548473358, "kl": 0.0, "learning_rate": 3.416149068322981e-07, "loss": -0.0257, "num_tokens": 32551721.0, "reward": 0.38333332538604736, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.0833333358168602, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 969 }, { "completion_length": 1291.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2350.0, "completions/max_terminated_length": 2350.0, "completions/mean_length": 1291.416748046875, "completions/mean_terminated_length": 1291.416748046875, "completions/min_length": 583.0, "completions/min_terminated_length": 583.0, "epoch": 0.32903663500678426, "frac_reward_zero_std": 0.5, "grad_norm": 0.5910433530807495, "kl": 0.0, "learning_rate": 3.414423740510697e-07, "loss": 0.0022, "num_tokens": 32579584.0, "reward": 1.0333335399627686, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func/mean": 0.7333332896232605, "rewards/correctness_reward_func/std": 0.3550501763820648, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 970 }, { "completion_length": 2047.666748046875, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 5423.0, "completions/mean_length": 3694.916748046875, "completions/mean_terminated_length": 2730.22216796875, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 0.3293758480325645, "frac_reward_zero_std": 0.0, "grad_norm": 0.6515673398971558, "kl": NaN, "learning_rate": 3.412698412698413e-07, "loss": 0.0215, "num_tokens": 32614278.0, "reward": 0.3916666507720947, "reward_std": 0.34035724401474, "rewards/correctness_reward_func/mean": 0.1666666716337204, "rewards/correctness_reward_func/std": 0.3892494738101959, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 971 }, { "completion_length": 1830.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4031.0, "completions/max_terminated_length": 4031.0, "completions/mean_length": 1830.3333740234375, "completions/mean_terminated_length": 1830.3333740234375, "completions/min_length": 1033.0, "completions/min_terminated_length": 1033.0, "epoch": 0.32971506105834464, "frac_reward_zero_std": 0.0, "grad_norm": 0.1503661572933197, "kl": 0.0, "learning_rate": 3.4109730848861284e-07, "loss": 0.0047, "num_tokens": 32649274.0, "reward": 1.2166666984558105, "reward_std": 0.10641199350357056, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 972 }, { "completion_length": 797.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1650.0, "completions/max_terminated_length": 1650.0, "completions/mean_length": 797.8333740234375, "completions/mean_terminated_length": 797.8333740234375, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 0.33005427408412485, "frac_reward_zero_std": 1.0, "grad_norm": 2.0924046850723244e-07, "kl": 0.0, "learning_rate": 3.409247757073844e-07, "loss": 0.0, "num_tokens": 32672906.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 973 }, { "completion_length": 1648.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4696.0, "completions/max_terminated_length": 4696.0, "completions/mean_length": 1648.0, "completions/mean_terminated_length": 1648.0, "completions/min_length": 584.0, "completions/min_terminated_length": 584.0, "epoch": 0.330393487109905, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.4075224292615595e-07, "loss": 0.0, "num_tokens": 32709692.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 974 }, { "completion_length": 795.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1302.0, "completions/max_terminated_length": 1302.0, "completions/mean_length": 795.8333740234375, "completions/mean_terminated_length": 795.8333740234375, "completions/min_length": 529.0, "completions/min_terminated_length": 529.0, "epoch": 0.33073270013568523, "frac_reward_zero_std": 1.0, "grad_norm": 8.773958626306921e-08, "kl": 0.0, "learning_rate": 3.4057971014492755e-07, "loss": 0.0, "num_tokens": 32731848.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 975 }, { "completion_length": 2053.666748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5526.0, "completions/mean_length": 2602.75, "completions/mean_terminated_length": 2240.36376953125, "completions/min_length": 801.0, "completions/min_terminated_length": 801.0, "epoch": 0.3310719131614654, "frac_reward_zero_std": 0.0, "grad_norm": 0.41066351532936096, "kl": NaN, "learning_rate": 3.4040717736369905e-07, "loss": 0.0118, "num_tokens": 32770622.0, "reward": 0.8583333492279053, "reward_std": 0.28804606199264526, "rewards/correctness_reward_func/mean": 0.5833333134651184, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 976 }, { "completion_length": 4053.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 6396.0, "completions/max_terminated_length": 6396.0, "completions/mean_length": 4053.58349609375, "completions/mean_terminated_length": 4053.58349609375, "completions/min_length": 2075.0, "completions/min_terminated_length": 2075.0, "epoch": 0.3314111261872456, "frac_reward_zero_std": 1.0, "grad_norm": 2.8918020689161494e-07, "kl": 0.0, "learning_rate": 3.4023464458247066e-07, "loss": 0.0, "num_tokens": 32829225.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 977 }, { "completion_length": 1234.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4258.0, "completions/max_terminated_length": 4258.0, "completions/mean_length": 1234.166748046875, "completions/mean_terminated_length": 1234.166748046875, "completions/min_length": 605.0, "completions/min_terminated_length": 605.0, "epoch": 0.33175033921302577, "frac_reward_zero_std": 0.5, "grad_norm": 0.6808133125305176, "kl": 0.0, "learning_rate": 3.400621118012422e-07, "loss": 0.026, "num_tokens": 32852561.0, "reward": 1.133333444595337, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.2806917726993561, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 978 }, { "completion_length": 2129.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3540.0, "completions/max_terminated_length": 3540.0, "completions/mean_length": 2129.416748046875, "completions/mean_terminated_length": 2129.416748046875, "completions/min_length": 742.0, "completions/min_terminated_length": 742.0, "epoch": 0.332089552238806, "frac_reward_zero_std": 1.0, "grad_norm": 2.425616116852325e-07, "kl": 0.0, "learning_rate": 3.398895790200138e-07, "loss": 0.0, "num_tokens": 32892274.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 979 }, { "completion_length": 1461.0833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 3444.0, "completions/mean_length": 2010.166748046875, "completions/mean_terminated_length": 1593.9091796875, "completions/min_length": 801.0, "completions/min_terminated_length": 801.0, "epoch": 0.33242876526458615, "frac_reward_zero_std": 0.5, "grad_norm": 0.22468790411949158, "kl": NaN, "learning_rate": 3.397170462387853e-07, "loss": -0.0282, "num_tokens": 32921297.0, "reward": 1.2041666507720947, "reward_std": 0.23474274575710297, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 980 }, { "completion_length": 2105.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6066.0, "completions/max_terminated_length": 6066.0, "completions/mean_length": 2105.416748046875, "completions/mean_terminated_length": 2105.416748046875, "completions/min_length": 554.0, "completions/min_terminated_length": 554.0, "epoch": 0.33276797829036636, "frac_reward_zero_std": 1.0, "grad_norm": 2.466020134761493e-07, "kl": 0.0, "learning_rate": 3.395445134575569e-07, "loss": 0.0, "num_tokens": 32956858.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 981 }, { "completion_length": 1458.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3091.0, "completions/max_terminated_length": 3091.0, "completions/mean_length": 1458.416748046875, "completions/mean_terminated_length": 1458.416748046875, "completions/min_length": 540.0, "completions/min_terminated_length": 540.0, "epoch": 0.3331071913161465, "frac_reward_zero_std": 1.0, "grad_norm": 1.5790541851856688e-07, "kl": 0.0, "learning_rate": 3.393719806763285e-07, "loss": 0.0, "num_tokens": 32984493.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 982 }, { "completion_length": 1025.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1851.0, "completions/max_terminated_length": 1851.0, "completions/mean_length": 1025.666748046875, "completions/mean_terminated_length": 1025.666748046875, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 0.33344640434192674, "frac_reward_zero_std": 0.5, "grad_norm": 0.08788394927978516, "kl": 0.0, "learning_rate": 3.391994478951001e-07, "loss": 0.0007, "num_tokens": 33010637.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 983 }, { "completion_length": 2536.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4205.0, "completions/max_terminated_length": 4205.0, "completions/mean_length": 2536.416748046875, "completions/mean_terminated_length": 2536.416748046875, "completions/min_length": 1341.0, "completions/min_terminated_length": 1341.0, "epoch": 0.3337856173677069, "frac_reward_zero_std": 0.5, "grad_norm": 0.15756388008594513, "kl": 0.0, "learning_rate": 3.390269151138716e-07, "loss": -0.0005, "num_tokens": 33048778.0, "reward": 0.7333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 984 }, { "completion_length": 832.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1372.0, "completions/max_terminated_length": 1372.0, "completions/mean_length": 832.4166870117188, "completions/mean_terminated_length": 832.4166870117188, "completions/min_length": 550.0, "completions/min_terminated_length": 550.0, "epoch": 0.3341248303934871, "frac_reward_zero_std": 0.5, "grad_norm": 0.04834370315074921, "kl": 0.0, "learning_rate": 3.388543823326432e-07, "loss": -0.0004, "num_tokens": 33074493.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 985 }, { "completion_length": 1010.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1571.0, "completions/max_terminated_length": 1571.0, "completions/mean_length": 1010.5833740234375, "completions/mean_terminated_length": 1010.5833740234375, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "epoch": 0.3344640434192673, "frac_reward_zero_std": 1.0, "grad_norm": 1.7334615165509604e-07, "kl": 0.0, "learning_rate": 3.386818495514148e-07, "loss": 0.0, "num_tokens": 33099076.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 986 }, { "completion_length": 1127.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3103.0, "completions/max_terminated_length": 3103.0, "completions/mean_length": 1127.166748046875, "completions/mean_terminated_length": 1127.166748046875, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 0.3348032564450475, "frac_reward_zero_std": 0.5, "grad_norm": 0.3109779357910156, "kl": 0.0, "learning_rate": 3.385093167701863e-07, "loss": -0.0022, "num_tokens": 33124332.0, "reward": 0.7666667699813843, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.46666669845581055, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 987 }, { "completion_length": 1063.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 2977.0, "completions/max_terminated_length": 2977.0, "completions/mean_length": 1063.75, "completions/mean_terminated_length": 1063.75, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "epoch": 0.33514246947082765, "frac_reward_zero_std": 0.5, "grad_norm": 0.540890634059906, "kl": 0.0, "learning_rate": 3.383367839889579e-07, "loss": 0.0257, "num_tokens": 33149661.0, "reward": 1.1166666746139526, "reward_std": 0.24832773208618164, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.3857303261756897, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 988 }, { "completion_length": 995.5833435058594, "completions/clipped_ratio": 0.5, "completions/max_length": 6589.0, "completions/max_terminated_length": 3265.0, "completions/mean_length": 4290.08349609375, "completions/mean_terminated_length": 1991.166748046875, "completions/min_length": 521.0, "completions/min_terminated_length": 521.0, "epoch": 0.33548168249660787, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": NaN, "learning_rate": 3.3816425120772945e-07, "loss": 0.0, "num_tokens": 33172666.0, "reward": 0.6499999761581421, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.15000000596046448, "rewards/format_reward_func/std": 0.15666989982128143, "step": 989 }, { "completion_length": 1330.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2185.0, "completions/max_terminated_length": 2185.0, "completions/mean_length": 1330.916748046875, "completions/mean_terminated_length": 1330.916748046875, "completions/min_length": 842.0, "completions/min_terminated_length": 842.0, "epoch": 0.3358208955223881, "frac_reward_zero_std": 0.5, "grad_norm": 0.36015501618385315, "kl": 0.0, "learning_rate": 3.3799171842650106e-07, "loss": 0.0031, "num_tokens": 33200667.0, "reward": 1.1166667938232422, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 990 }, { "completion_length": 1213.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 2447.0, "completions/max_terminated_length": 2447.0, "completions/mean_length": 1213.75, "completions/mean_terminated_length": 1213.75, "completions/min_length": 768.0, "completions/min_terminated_length": 768.0, "epoch": 0.33616010854816825, "frac_reward_zero_std": 1.0, "grad_norm": 1.6253594026238716e-07, "kl": 0.0, "learning_rate": 3.3781918564527256e-07, "loss": 0.0, "num_tokens": 33226422.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 991 }, { "completion_length": 1514.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3412.0, "completions/max_terminated_length": 3412.0, "completions/mean_length": 1514.75, "completions/mean_terminated_length": 1514.75, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.33649932157394846, "frac_reward_zero_std": 1.0, "grad_norm": 1.840590471147152e-07, "kl": 0.0, "learning_rate": 3.3764665286404416e-07, "loss": 0.0, "num_tokens": 33257235.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 992 }, { "completion_length": 794.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1140.0, "completions/max_terminated_length": 1140.0, "completions/mean_length": 794.75, "completions/mean_terminated_length": 794.75, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 0.3368385345997286, "frac_reward_zero_std": 0.5, "grad_norm": 0.06748513877391815, "kl": 0.0, "learning_rate": 3.374741200828157e-07, "loss": 0.0, "num_tokens": 33277566.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 993 }, { "completion_length": 610.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1054.0, "completions/max_terminated_length": 1054.0, "completions/mean_length": 610.9166870117188, "completions/mean_terminated_length": 610.9166870117188, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.33717774762550884, "frac_reward_zero_std": 0.5, "grad_norm": 0.049711357802152634, "kl": 0.0, "learning_rate": 3.373015873015873e-07, "loss": 0.0005, "num_tokens": 33298025.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 994 }, { "completion_length": 1622.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4141.0, "completions/max_terminated_length": 4141.0, "completions/mean_length": 1622.416748046875, "completions/mean_terminated_length": 1622.416748046875, "completions/min_length": 766.0, "completions/min_terminated_length": 766.0, "epoch": 0.337516960651289, "frac_reward_zero_std": 0.5, "grad_norm": 0.4773035943508148, "kl": 0.0, "learning_rate": 3.371290545203588e-07, "loss": 0.0057, "num_tokens": 33325894.0, "reward": 1.1000001430511475, "reward_std": 0.20000001788139343, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.2696799635887146, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 995 }, { "completion_length": 787.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1782.0, "completions/max_terminated_length": 1782.0, "completions/mean_length": 787.25, "completions/mean_terminated_length": 787.25, "completions/min_length": 481.0, "completions/min_terminated_length": 481.0, "epoch": 0.3378561736770692, "frac_reward_zero_std": 1.0, "grad_norm": 9.43017894883269e-08, "kl": 0.0, "learning_rate": 3.369565217391304e-07, "loss": 0.0, "num_tokens": 33346219.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 996 }, { "completion_length": 887.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1226.0, "completions/max_terminated_length": 1226.0, "completions/mean_length": 887.0833740234375, "completions/mean_terminated_length": 887.0833740234375, "completions/min_length": 589.0, "completions/min_terminated_length": 589.0, "epoch": 0.3381953867028494, "frac_reward_zero_std": 1.0, "grad_norm": 1.7791666095945402e-07, "kl": 0.0, "learning_rate": 3.36783988957902e-07, "loss": 0.0, "num_tokens": 33373244.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 997 }, { "completion_length": 894.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1343.0, "completions/max_terminated_length": 1343.0, "completions/mean_length": 894.1666870117188, "completions/mean_terminated_length": 894.1666870117188, "completions/min_length": 571.0, "completions/min_terminated_length": 571.0, "epoch": 0.3385345997286296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.3661145617667353e-07, "loss": 0.0, "num_tokens": 33394702.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 998 }, { "completion_length": 802.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1790.0, "completions/max_terminated_length": 1790.0, "completions/mean_length": 802.0833740234375, "completions/mean_terminated_length": 802.0833740234375, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 0.33887381275440975, "frac_reward_zero_std": 0.0, "grad_norm": 0.1298748254776001, "kl": 0.0, "learning_rate": 3.364389233954451e-07, "loss": 0.0004, "num_tokens": 33415397.0, "reward": 1.2333333492279053, "reward_std": 0.10327950119972229, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 999 }, { "completion_length": 594.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 735.0, "completions/max_terminated_length": 735.0, "completions/mean_length": 594.8333740234375, "completions/mean_terminated_length": 594.8333740234375, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.33921302578018997, "frac_reward_zero_std": 1.0, "grad_norm": 1.740247910220205e-07, "kl": 0.0, "learning_rate": 3.362663906142167e-07, "loss": 0.0, "num_tokens": 33435015.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1000 } ], "logging_steps": 1, "max_steps": 2948, "num_input_tokens_seen": 33435015, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }